diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml
index 86838836de..d9e6956735 100644
--- a/.github/ISSUE_TEMPLATE/1-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -25,6 +25,18 @@ body:
A placeholder for the command.
validations:
required: true
+- type: textarea
+ attributes:
+ label: Environment
+ description: |
+ 1. Please run `lmdeploy check_env` to collect necessary environment information and paste it here.
+ 2. You may add any additional information that may be helpful for locating the problem, such as
+ - How you installed PyTorch \[e.g., pip, conda, source\]
+ - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+ placeholder: Environment here.
+ render: Shell
+ validations:
+ required: true
- type: textarea
attributes:
label: Error traceback
diff --git a/README.md b/README.md
index a2de4d6ac0..7639675aba 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
## Supported Models
-`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`.
+`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.
### TurboMind
@@ -63,6 +63,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
+| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
@@ -118,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### Inference by TurboMind
```shell
-python -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -139,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace
#### Serving with gradio
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -149,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
Launch inference server by:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --instance_num 32 --tp 1
```
Then, you can communicate with it by command line,
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client api_server_url
```
or webui,
```shell
-# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
+# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
Refer to [restful_api.md](docs/en/restful_api.md) for more details.
@@ -181,13 +182,13 @@ bash workspace/service_docker_up.sh
Then, you can communicate with the inference server by command line,
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_addresss}:33337
```
or webui,
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_addresss}:33337
```
For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md)
@@ -199,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc
#### Single GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 09c66c2826..38faad0583 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -53,7 +53,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
## 支持的模型
-`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端
+`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端。运行`lmdeploy list`可查看支持模型列表
### TurboMind
@@ -64,6 +64,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
| :----------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
+| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
@@ -119,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. 转换为 trubomind 要求的格式。默认存放路径为 ./workspace
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### 使用 turbomind 推理
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -139,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
#### 启动 gradio server
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -149,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
使用下面的命令启动推理服务:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
你可以通过命令行方式与推理服务进行对话:
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client api_server_url
```
也可以通过 WebUI 方式来对话:
```shell
-# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
+# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)。
@@ -181,13 +182,13 @@ bash workspace/service_docker_up.sh
你可以通过命令行方式与推理服务进行对话:
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_addresss}:33337
```
也可以通过 WebUI 方式来对话:
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_addresss}:33337
```
其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md)
@@ -203,7 +204,7 @@ pip install deepspeed
#### 单个 GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
diff --git a/benchmark/README.md b/benchmark/README.md
index b5573ae2b8..3fa117210e 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -30,7 +30,7 @@ pip install nvidia-ml-py
```bash
python profile_generation.py \
--model-path /path/to/your/model \
- --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
+ --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
```
## profile serving
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
index e64a6708cd..325877f4e3 100644
--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
@@ -106,7 +106,7 @@ def _infer(model, session_id):
def profile_throughput(model_path: str,
concurrency: int = 1,
- input_seqlen: int = 0,
+ input_seqlen: int = 1,
output_seqlen: int = 512,
test_round: int = 10,
tp: int = 1,
@@ -133,8 +133,10 @@ def profile_throughput(model_path: str,
)
# make up a prompt that can be tokenized into {input_seqlen} tokens
- prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
+ assert input_seqlen > 0, 'input_seqlen should be greater than 0'
+ prompt = 'hi'
input_ids = tokenizer.encode(prompt)
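+ # repeating the encoded ids approximates a prompt of about input_seqlen tokens;
+ # the actual length is len(tokenizer.encode('hi')) * input_seqlen and may
+ # include repeated special tokens (e.g. BOS) depending on the tokenizer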
+ input_ids = input_ids * input_seqlen
warmup(tm_model,
concurrency,
diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index d1f6ebf80e..394c7ec1b9 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -1,98 +1,73 @@
import json
-import multiprocessing as mp
import random
import time
-from typing import Iterable, List
+from queue import Queue
+from threading import Thread
import fire
import numpy as np
-import requests
+from lmdeploy.serve.openai.api_client import get_streaming_response
from lmdeploy.tokenizer import Tokenizer
-from lmdeploy.utils import get_logger
-
-
-def get_streaming_response(prompt: str,
- api_url: str,
- session_id: int,
- request_output_len: int,
- stream: bool = True,
- sequence_start: bool = True,
- sequence_end: bool = False,
- ignore_eos: bool = False) -> Iterable[List[str]]:
- headers = {'User-Agent': 'Test Client'}
- pload = {
- 'prompt': prompt,
- 'stream': stream,
- 'session_id': session_id,
- 'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
- 'ignore_eos': ignore_eos
- }
- response = requests.post(api_url,
- headers=headers,
- json=pload,
- stream=stream)
- for chunk in response.iter_lines(chunk_size=8192,
- decode_unicode=False,
- delimiter=b'\n'):
- if chunk:
- data = json.loads(chunk.decode('utf-8'))
- output = data['text']
- tokens = data['tokens']
- yield output, tokens
-
-
-def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
- res_que: mp.Queue):
+
+
+def infer(server_addr: str, session_id: int, req_queue: Queue, res_que: Queue,
+ stream_output: bool):
stats = []
- while not req_queue.empty():
- prompt, input_seqlen, output_seqlen = req_queue.get()
- get_logger('profile_restful_api').info(
- f'request info: session {session_id}, '
- f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}')
+ for prompt, input_seqlen, output_seqlen in iter(req_queue.get,
+ [None, None, None]):
+ if prompt is None:
+ break
timestamps = []
tokens = []
- start = time.perf_counter()
- for res, token in get_streaming_response(
+ timestamps.append(time.perf_counter())
+ for res, token, status in get_streaming_response(
prompt,
server_addr,
session_id,
request_output_len=output_seqlen,
- sequence_start=True,
- sequence_end=True):
+ interactive_mode=False,
+ ignore_eos=True,
+ stream=stream_output):
timestamps.append(time.perf_counter())
tokens.append(token)
- first_token_latency = timestamps[1] - start
- token_latency = timestamps[-1] - timestamps[0]
- token = tokens[-1] - tokens[0]
- stats.append([first_token_latency, token, token_latency])
+ first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
+ token_latency = np.round(timestamps[-1] - timestamps[0], 3)
+ completion_tokens = tokens[-1]
+ total_tokens = tokens[-1] + input_seqlen
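+ # stats row layout: [first_token_latency, completion_tokens,
+ # requested output_seqlen, total_tokens, token_latency]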
+ stats.append([
+ first_token_latency, completion_tokens, output_seqlen,
+ total_tokens, token_latency
+ ])
+ print(f'session {session_id}: '
+ f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
+ f'completion_tokens {completion_tokens}')
res_que.put((session_id, stats))
def warmup(server_addr: str,
concurrency: int,
output_seqlen: int,
- warmup_round: int = 1):
+ warmup_round: int = 1,
+ stream_output: bool = False):
print('start to warmup ...')
def _infer(server_addr, session_id):
for _ in range(warmup_round):
- for _, _ in get_streaming_response(
- '',
- server_addr,
- session_id,
- request_output_len=output_seqlen,
- sequence_start=True,
- sequence_end=True):
+ for _ in get_streaming_response('',
+ server_addr,
+ session_id,
+ request_output_len=output_seqlen,
+ interactive_mode=False,
+ stream=stream_output,
+ ignore_eos=True):
continue
_start = time.perf_counter()
procs = []
for i in range(concurrency):
- proc = mp.Process(target=_infer, args=(server_addr, i + 1))
+ proc = Thread(target=_infer, args=(server_addr, i + 1))
procs.append(proc)
proc.start()
for proc in procs:
@@ -115,6 +90,7 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
print(f'elapsed time for read data: '
f'{round(time.perf_counter() - start, 2)} s')
+ print('start tokenization. This takes a while, please wait...')
start = time.perf_counter()
tokenizer = Tokenizer(tokenizer_path)
prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
@@ -136,9 +112,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
if samples > 0:
filtered_dataset = random.sample(filtered_dataset, samples)
- que = mp.Queue()
+ que = Queue()
for data in filtered_dataset:
que.put(data)
+ que.put((None, None, None))
print(f'elapsed time for filtering: '
f'{round(time.perf_counter() - start, 2)} s')
return que, len(filtered_dataset)
@@ -149,17 +126,20 @@ def main(server_addr: str,
dataset_path: str,
concurrency: int = 1,
session_len: int = 2048,
- samples: int = 1000):
- api_url = server_addr + '/generate'
- warmup(api_url, concurrency, session_len - 1)
+ samples: int = 1000,
+ stream_output: bool = False):
+ api_url = server_addr + '/v1/chat/interactive'
+ warmup(api_url, concurrency, session_len - 1, 4, stream_output)
req_queue, n_req = read_dataset(tokenizer_path, dataset_path, samples,
session_len)
- res_que = mp.Queue()
+ for i in range(concurrency):
+ req_queue.put([None, None, None])
+ res_que = Queue()
procs = []
_start = time.perf_counter()
for i in range(concurrency):
- proc = mp.Process(target=infer,
- args=(api_url, i + 1, req_queue, res_que))
+ proc = Thread(target=infer,
+ args=(api_url, i + 1, req_queue, res_que, stream_output))
procs.append(proc)
proc.start()
for proc in procs:
@@ -174,22 +154,40 @@ def main(server_addr: str,
f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
stats.append(np.array(_stats))
- stats = np.concatenate(stats).reshape(-1, 3)
+ stats = np.concatenate(stats).reshape(-1, 5)
first_token_latency_min = np.min(stats[:, 0], axis=0)
first_token_latency_max = np.max(stats[:, 0], axis=0)
first_token_latency_ave = np.mean(stats[:, 0], axis=0)
- token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
- req_throughput = n_req / elapsed_time
+ completion_tokens = np.sum(stats[:, 1], axis=0)
+ request_output_tokens = np.sum(stats[:, 2], axis=0)
+ total_tokens = np.sum(stats[:, 3], axis=0)
+ prompt_tokens = total_tokens - completion_tokens
+ completion_token_throughput = completion_tokens / elapsed_time
+ total_token_throughput = total_tokens / elapsed_time
+ rqs = n_req / elapsed_time
+ rqm = rqs * 60
+
+ if not (np.abs(stats[:, 1] - stats[:, 2]) <= 1).all():
+ print(f'Did not generate requested number of tokens. '
+ f'Requested {request_output_tokens:.0f}, '
+ f'but got {completion_tokens:.0f}')
print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
- f'elapsed_time: {elapsed_time:.2f}s\n'
- f'first_token latency(min, max, ave): '
- f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
- f'{first_token_latency_ave:.2f}s\n'
- f'token throughput: {token_throughput:.2f} token/s\n'
- f'req throughput: {req_throughput:.2f} req/s\n'
- f'{"-" * 50}\n')
+ f'elapsed_time: {elapsed_time:.3f}s\n')
+ if stream_output:
+ print(f'first_token latency(min, max, ave): '
+ f'{first_token_latency_min:.3f}s, '
+ f'{first_token_latency_max:.3f}s, '
+ f'{first_token_latency_ave:.3f}s\n')
+ print(
+ f'number of prompt tokens: {prompt_tokens:.0f}\n'
+ f'number of completion tokens: {completion_tokens:.0f}\n'
+ f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
+ f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
+ f'RPS (request per second): {rqs:.3f} req/s\n'
+ f'RPM (request per minute): {rqm:.3f} req/min\n'
+ f'{"-" * 50}\n')
if __name__ == '__main__':
diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py
index 4580757eeb..ee23452d8a 100644
--- a/benchmark/profile_serving.py
+++ b/benchmark/profile_serving.py
@@ -17,7 +17,7 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
[None, None, None]):
timestamps = []
tokens = []
- start = time.perf_counter()
+ timestamps.append(time.perf_counter())
for status, res, token in chatbot.stream_infer(
session_id,
prompt,
@@ -26,13 +26,17 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
sequence_end=True):
timestamps.append(time.perf_counter())
tokens.append(token)
-
- first_token_latency = np.round(timestamps[1] - start, 3)
+ first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
token_latency = np.round(timestamps[-1] - timestamps[0], 3)
- token = tokens[-1] - tokens[0]
- stats.append([first_token_latency, token, token_latency])
+ completion_tokens = tokens[-1]
+ total_tokens = tokens[-1] + input_seqlen
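+ # stats row: [first_token_latency, completion_tokens, requested output_seqlen, total_tokens, token_latency]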
+ stats.append([
+ first_token_latency, completion_tokens, output_seqlen,
+ total_tokens, token_latency
+ ])
print(f'session {session_id}: '
- f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}')
+ f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
+ f'completion_tokens {completion_tokens}')
res_que.put((session_id, stats))
@@ -84,6 +88,7 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
completions = [completion for _, completion in dataset]
print(f'elapsed time for read data: '
f'{round(time.perf_counter() - start, 2)} s')
+ print('start tokenization. This takes a while, please wait...')
start = time.perf_counter()
tokenizer = Tokenizer(tokenizer_path)
@@ -124,7 +129,6 @@ def main(tritonserver_addr: str,
res_que = mp.Queue()
procs = []
- _start = time.perf_counter()
for i in range(concurrency):
chatbot = Chatbot(tritonserver_addr=tritonserver_addr,
display=False,
@@ -134,13 +138,15 @@ def main(tritonserver_addr: str,
proc = mp.Process(target=infer,
args=(chatbot, i + 1, req_que, res_que))
procs.append(proc)
- proc.start()
# read data and put it to queue
n_req = read_dataset(tokenizer_path, dataset_path, samples, session_len,
req_que)
for i in range(concurrency):
req_que.put([None, None, None])
+ _start = time.perf_counter()
+ for proc in procs:
+ proc.start()
stats = []
for i in range(concurrency):
@@ -149,27 +155,42 @@ def main(tritonserver_addr: str,
f'session {session_id}: processed reqs {len(_stats)}, '
f'stats: \n{_stats}\n{"-" * 50}\n')
stats.append(np.array(_stats))
-
_end = time.perf_counter()
+
elapsed_time = _end - _start
- stats = np.concatenate(stats).reshape(-1, 3)
+ stats = np.concatenate(stats).reshape(-1, 5)
first_token_latency_min = np.min(stats[:, 0], axis=0)
first_token_latency_max = np.max(stats[:, 0], axis=0)
first_token_latency_ave = np.mean(stats[:, 0], axis=0)
- token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
- req_throughput = n_req / elapsed_time
-
- print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
- f'elapsed_time: {elapsed_time:.3f}s\n'
- f'first_token latency(min, max, ave): '
- f'{first_token_latency_min:.3f}s, {first_token_latency_max:.3f}s, '
- f'{first_token_latency_ave:.3f}s\n'
- f'token throughput: {token_throughput:.3f} token/s\n'
- f'req throughput: {req_throughput:.3f} req/s\n'
- f'{"-" * 50}\n')
-
+ completion_tokens = np.sum(stats[:, 1], axis=0)
+ request_output_tokens = np.sum(stats[:, 2], axis=0)
+ total_tokens = np.sum(stats[:, 3], axis=0)
+ prompt_tokens = total_tokens - completion_tokens
+ completion_token_throughput = completion_tokens / elapsed_time
+ total_token_throughput = total_tokens / elapsed_time
+ rqs = n_req / elapsed_time
+ rqm = rqs * 60
+
+ if not (np.abs(stats[:, 1] - stats[:, 2]) <= 1).all():
+ print(f'Did not generate requested number of tokens. '
+ f'Requested {request_output_tokens:.0f}, '
+ f'but got {completion_tokens:.0f}')
+
+ print(
+ f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
+ f'elapsed_time: {elapsed_time:.3f}s\n'
+ f'first_token latency(min, max, ave): '
+ f'{first_token_latency_min:.3f}s, {first_token_latency_max:.3f}s, '
+ f'{first_token_latency_ave:.3f}s\n'
+ f'number of prompt tokens: {prompt_tokens:.0f}\n'
+ f'number of completion tokens: {completion_tokens:.0f}\n'
+ f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
+ f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
+ f'RPS (request per second): {rqs:.3f} req/s\n'
+ f'RPM (request per minute): {rqm:.3f} req/min\n'
+ f'{"-" * 50}\n')
for proc in procs:
proc.join()
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 610fbb7657..77a0b6f242 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -8,6 +8,7 @@
from typing import List, Tuple
import fire
+import numpy as np
from lmdeploy.tokenizer import Tokenizer
@@ -80,88 +81,137 @@ def __init__(self, model_path: str, tp: int = 1):
self.tm_model = tm_model
self.tokenizer = tokenizer
- def _inference(self, queue, session_id: int):
-
+ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
+ stream_output: bool):
model_inst = self.tm_model.create_instance()
- while True:
- request = queue.get()
- if request is None:
- # stop signal
- queue.put(None)
- return
- else:
- prompt, _, output_seqlen = request
- input_ids = self.tokenizer.encode(prompt)
-
- for outputs in model_inst.stream_infer(
- session_id,
- input_ids=input_ids,
- request_output_len=output_seqlen,
- temperature=1.0,
- top_p=1.0,
- sequence_start=True,
- sequence_end=True,
- ignore_eos=True,
- sampling_param=self.sampling_param):
- if len(outputs) > 1:
- res, tokens = outputs[-2:]
- else:
- res, tokens = outputs[0]
- self.tokenizer.decode(res)
-
- # for pytorch engine to restart a session
- if hasattr(model_inst, 'end'):
- model_inst.end(session_id)
-
- def process_request(self, requests, concurrency: int = 1):
- q = Queue()
+ stats = []
+ timestamps = []
+ tokens = []
+ timestamps.append(time.perf_counter())
+ for prompt, input_seqlen, output_seqlen in iter(
+ req_queue.get, [None, None, None]):
+ input_ids = self.tokenizer.encode(prompt)
+ offset = 0
+ for outputs in model_inst.stream_infer(
+ session_id,
+ input_ids=input_ids,
+ request_output_len=output_seqlen,
+ temperature=1.0,
+ top_p=1.0,
+ sequence_start=True,
+ sequence_end=True,
+ ignore_eos=True,
+ stream_output=stream_output):
+ if len(outputs) > 1:
+ res, token = outputs[-2:]
+ else:
+ res, token = outputs[0]
+ self.tokenizer.decode(res, offset)
+ offset = token
+ timestamps.append(time.perf_counter())
+ tokens.append(token)
+ # for pytorch engine to restart a session
+ if hasattr(model_inst, 'end'):
+ model_inst.end(session_id)
+ first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
+ token_latency = np.round(timestamps[-1] - timestamps[0], 3)
+ completion_tokens = tokens[-1]
+ total_tokens = tokens[-1] + len(input_ids)
+ stats.append([
+ first_token_latency, completion_tokens, output_seqlen,
+ total_tokens, token_latency
+ ])
+ print(
+ f'session {session_id}: '
+ f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
+ f'completion_tokens {completion_tokens}')
+ res_queue.put((session_id, stats))
+
+ def process_request(self,
+ requests,
+ concurrency: int = 1,
+ stream_output: bool = True):
+ res_queue = Queue()
+ req_queue = Queue()
threads = []
+ # feed request to q
+ for req in requests:
+ req_queue.put(req)
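+ # one stop sentinel per worker thread so that each consumer's
+ # iter(req_queue.get, [None, None, None]) loop terminates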
+ for i in range(concurrency):
+ req_queue.put([None, None, None])
+
start = time.time()
# start threads
for i in range(concurrency):
- t = Thread(target=self._inference, args=(q, i))
+ t = Thread(target=self._inference,
+ args=(req_queue, res_queue, i, stream_output))
t.start()
threads.append(t)
- # feed request to q
- for req in requests:
- q.put(req)
-
- q.put(None)
-
# wait for finish
for t in threads:
t.join()
- end = time.time()
-
- return end - start
+ elapsed_time = time.time() - start
+
+ stats = []
+ while not res_queue.empty():
+ session_id, _stats = res_queue.get()
+ print(f'\n{"-" * 50}\n'
+ f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
+ stats.append(np.array(_stats))
+
+ stats = np.concatenate(stats).reshape(-1, 5)
+
+ first_token_latency_min = np.min(stats[:, 0], axis=0)
+ first_token_latency_max = np.max(stats[:, 0], axis=0)
+ first_token_latency_ave = np.mean(stats[:, 0], axis=0)
+ completion_tokens = np.sum(stats[:, 1], axis=0)
+ request_output_tokens = np.sum(stats[:, 2], axis=0)
+ total_tokens = np.sum(stats[:, 3], axis=0)
+ prompt_tokens = total_tokens - completion_tokens
+ completion_token_throughput = completion_tokens / elapsed_time
+ total_token_throughput = total_tokens / elapsed_time
+ rqs = len(requests) / elapsed_time
+ rqm = rqs * 60
+
+ if not (np.abs(stats[:, 1] - stats[:, 2]) <= 1).all():
+ print(f'Did not generate requested number of tokens. '
+ f'Requested {request_output_tokens:.0f}, '
+ f'but got {completion_tokens:.0f}')
+
+ print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
+ f'elapsed_time: {elapsed_time:.3f}s\n')
+ if stream_output:
+ print(f'first_token latency(min, max, ave): '
+ f'{first_token_latency_min:.3f}s, '
+ f'{first_token_latency_max:.3f}s, '
+ f'{first_token_latency_ave:.3f}s\n')
+ print(
+ f'number of prompt tokens: {prompt_tokens:.0f}\n'
+ f'number of completion tokens: {completion_tokens:.0f}\n'
+ f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
+ f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
+ f'RPS (request per second): {rqs:.3f} req/s\n'
+ f'RPM (request per minute): {rqm:.3f} req/min\n'
+ f'{"-" * 50}\n')
def main(dataset: str,
model_path: str,
concurrency: int = 1,
num_prompts: int = 1000,
- tp: int = 1):
+ tp: int = 1,
+ stream_output: bool = True):
engine = Engine(model_path, tp=tp)
tokenizer = engine.tokenizer
requests = sample_requests(dataset, num_prompts, tokenizer)
- elapsed_time = engine.process_request(requests, concurrency)
- total_num_tokens = sum(prompt_len + output_len
- for _, prompt_len, output_len in requests)
- total_num_out_tokens = sum(output_len for _, _, output_len in requests)
- print(f'Throughput requests: {len(requests) / elapsed_time:.2f} req/s')
- print(
- f'Throughput requests: {len(requests) * 60 / elapsed_time:.2f} req/min'
- )
- print(f'Throughput tokens: {total_num_tokens / elapsed_time:.2f} tokens/s')
- print('Throughput tokens(output only):'
- f'{total_num_out_tokens / elapsed_time:.2f} tokens/s')
+ engine.process_request(requests, concurrency, stream_output)
if __name__ == '__main__':
diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index abb90562a2..8d1eb16de9 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -11,7 +11,7 @@ source /opt/conda/bin/activate
conda activate $PYTHON_VERSION
cd lmdeploy
-mkdir build && cd build
+mkdir -p build && cd build && rm -rf *
bash ../generate.sh
make -j$(nproc) && make install
if [ $? != 0 ]; then
diff --git a/docs/en/build.md b/docs/en/build.md
index 7ee53ac90c..cb278073c9 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -1,22 +1,79 @@
-## Build from source
+# Build from source
-- install packages for compiling and running:
+LMDeploy provides prebuilt packages that can be installed easily with `pip install lmdeploy`.
- ```shell
- conda create -n lmdeploy python=3.10
- conda activate lmdeploy
+If you want to build lmdeploy from source, please clone the lmdeploy repository from GitHub and follow the instructions in the next sections:
- git clone https://github.com/InternLM/lmdeploy.git
- cd lmdeploy
+```shell
+git clone --depth=1 https://github.com/InternLM/lmdeploy
+```
- pip install -r requirements.txt
- conda install openmpi-mpicxx nccl rapidjson -c conda-forge
- ```
+## Build in Docker (recommended)
+
+We strongly recommend using the provided docker image to build lmdeploy, as it avoids complex environment setup.
+
+The docker image is `openmmlab/lmdeploy-builder:cuda11.8`. Make sure that docker is installed before using this image.
+
+In the root directory of the lmdeploy source code, please run the following command:
+
+```shell
+cd lmdeploy # the home folder of lmdeploy source code
+bash builder/manywheel/build_all_wheel.sh
+```
+
+The wheel files of lmdeploy for Python 3.8 - 3.11 will be placed in the `builder/manywheel/cuda11.8_dist` directory, for example:
+
+```text
+builder/manywheel/cuda11.8_dist/
+├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl
+├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl
+├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl
+└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl
+```
+
+If the wheel file for a specific Python version is required, such as py3.8, please execute:
+
+```shell
+bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist
+```
+
+And the wheel file will be found in the `builder/manywheel/cuda11.8_dist` directory.
+
+You can use `pip install` to install the wheel file that matches the Python version on your host machine.
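+
+For example, on a Python 3.10 host the command might look like the following (the exact file name depends on the lmdeploy version you built):
+
+```shell
+pip install builder/manywheel/cuda11.8_dist/lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl
+```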
-- build and install lmdeploy:
+## Build in localhost (optional)
+First, please make sure that the gcc version is no less than 9, which can be confirmed by `gcc --version`.
+
+Then, follow the steps below to set up the compilation environment:
+
+- install the dependent packages:
+ ```shell
+ pip install -r requirements.txt
+ apt-get install rapidjson-dev
+ ```
+- install [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), and set environment variables:
+ ```shell
+ export NCCL_ROOT_DIR=/path/to/nccl/build
+ export NCCL_LIBRARIES=/path/to/nccl/build/lib
+ ```
+- install openmpi from source:
+ ```shell
+ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz
+ tar xf openmpi-4.1.5.tar.gz
+ cd openmpi-4.1.5
+ ./configure
+ make -j$(nproc) && make install
+ ```
+- build and install lmdeploy libraries:
```shell
+ cd lmdeploy # the home folder of lmdeploy
mkdir build && cd build
sh ../generate.sh
make -j$(nproc) && make install
```
+- install lmdeploy python package:
+ ```shell
+ cd ..
+ pip install -e .
+ ```
diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md
index 1f5f5aa125..5dcf43ba68 100644
--- a/docs/en/kv_int8.md
+++ b/docs/en/kv_int8.md
@@ -18,7 +18,7 @@ dequant: f = q * scale + zp
Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory.
```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
If you already have a workspace directory, skip this step.
@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps:
```bash
# get minmax
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval
--calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately
@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
--work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format
# get quant parameters
-python3 -m lmdeploy.lite.apis.kv_qparams \
+lmdeploy lite kv_qparams \
--work_dir $WORK_DIR \ # Directory of the last output
--turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters
--kv_sym False \ # Symmetric or asymmetric quantization, default is False
@@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre
Test the chat performance.
```bash
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
## GPU Memory Test
diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md
index e3662ab373..e4cd5a9cbe 100644
--- a/docs/en/pytorch.md
+++ b/docs/en/pytorch.md
@@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and
**Example 1**: Chat with default setting
```shell
-python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
+lmdeploy chat torch $PATH_TO_HF_MODEL
```
**Example 2**: Disable sampling and chat history
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0 --max-history 0
```
@@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \
**Example 3**: Accelerate with deepspeed inference
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md
index cb70e26375..7f49edce1e 100644
--- a/docs/en/restful_api.md
+++ b/docs/en/restful_api.md
@@ -3,56 +3,61 @@
### Launch Service
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage.
-We provide four restful api in total. Three of them are in OpenAI format. However, we recommend users try
-our own api which provides more arguments for users to modify. The performance is comparatively better.
+We provide four restful apis in total. Three of them are in OpenAI format:
+
+- /v1/chat/completions
+- /v1/models
+- /v1/completions
+
+However, we recommend that users try our own api `/v1/chat/interactive`,
+which provides more arguments for users to modify. Its performance is comparatively better.
+
+**Note**: if you want to launch multiple requests, please set a different `session_id` for each request to the
+`/v1/chat/completions` and `/v1/chat/interactive` apis. Otherwise, random values will be assigned.
### python
-Here is an example for our own api `generate`.
+We have integrated the client-side functionalities of these services into the `APIClient` class. Below are some examples demonstrating how to invoke the `api_server` service on the client side.
+
+If you want to use the `/v1/chat/completions` endpoint, you can try the following code:
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+model_name = api_client.available_models[0]
+messages = [{"role": "user", "content": "Say this is a test!"}]
+for item in api_client.chat_completions_v1(model=model_name, messages=messages):
+ print(item)
+```
+
+If you want to use the `/v1/completions` endpoint, you can try the following code:
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+model_name = api_client.available_models[0]
+for item in api_client.completions_v1(model=model_name, prompt='hi'):
+ print(item)
+```
+
+LMDeploy supports maintaining session histories on the server for the `/v1/chat/interactive` api. The
+feature is disabled by default.
+
+- In interactive mode, the chat history is kept on the server. In a multi-round conversation, you should set
+  `interactive_mode = True` and use the same `session_id` (it can't be -1, which is the default value) for all requests to `/v1/chat/interactive`.
+- In normal mode, no chat history is kept on the server.
+
+The interactive mode can be controlled by the `interactive_mode` boolean parameter. The following is an example of normal mode. If you want to experience the interactive mode, simply pass in `interactive_mode=True`.
```python
-import json
-import requests
-from typing import Iterable, List
-
-
-def get_streaming_response(prompt: str,
- api_url: str,
- session_id: int,
- request_output_len: int,
- stream: bool = True,
- sequence_start: bool = True,
- sequence_end: bool = True,
- ignore_eos: bool = False) -> Iterable[List[str]]:
- headers = {'User-Agent': 'Test Client'}
- pload = {
- 'prompt': prompt,
- 'stream': stream,
- 'session_id': session_id,
- 'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
- 'ignore_eos': ignore_eos
- }
- response = requests.post(
- api_url, headers=headers, json=pload, stream=stream)
- for chunk in response.iter_lines(
- chunk_size=8192, decode_unicode=False, delimiter=b'\n'):
- if chunk:
- data = json.loads(chunk.decode('utf-8'))
- output = data['text']
- tokens = data['tokens']
- yield output, tokens
-
-
-for output, tokens in get_streaming_response(
- "Hi, how are you?", "http://{server_ip}:{server_port}/generate", 0,
- 512):
- print(output, end='')
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+for item in api_client.generate(prompt='hi'):
+ print(item)
```
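+
+The following is a sketch of the interactive mode described above, assuming `generate` forwards `session_id` and `interactive_mode` to the `/v1/chat/interactive` endpoint (the same fields used in the cURL example below):
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+# assumption: generate() accepts session_id and interactive_mode keyword arguments;
+# reuse one non-default session_id so the server keeps the chat history
+for item in api_client.generate(prompt='hi', session_id=1, interactive_mode=True):
+ print(item)
+for item in api_client.generate(prompt='please continue', session_id=1, interactive_mode=True):
+ print(item)
+```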
### Java/Golang/Rust
@@ -84,16 +89,15 @@ List Models:
curl http://{server_ip}:{server_port}/v1/models
```
-Generate:
+Interactive Chat:
```bash
-curl http://{server_ip}:{server_port}/generate \
+curl http://{server_ip}:{server_port}/v1/chat/interactive \
-H "Content-Type: application/json" \
-d '{
"prompt": "Hello! How are you?",
"session_id": 1,
- "sequence_start": true,
- "sequence_end": true
+ "interactive_mode": true
}'
```
@@ -104,19 +108,19 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "internlm-chat-7b",
- "messages": [{"role": "user", "content": "Hello! Ho are you?"}]
+ "messages": [{"role": "user", "content": "Hello! How are you?"}]
}'
```
-Embeddings:
+Text Completions:
-```bash
-curl http://{server_ip}:{server_port}/v1/embeddings \
- -H "Content-Type: application/json" \
+```shell
+curl http://{server_ip}:{server_port}/v1/completions \
+ -H 'Content-Type: application/json' \
-d '{
- "model": "internlm-chat-7b",
- "input": "Hello world!"
- }'
+ "model": "llama",
+ "prompt": "two steps to build a house:"
+}'
```
### CLI client
@@ -125,7 +129,7 @@ There is a client script for restful api server.
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client api_server_url
```
### webui
@@ -133,10 +137,10 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
You can also test restful-api through webui.
```shell
-# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
+# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
### FAQ
@@ -146,10 +150,6 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
2. When OOM appeared at the server side, please reduce the number of `instance_num` when lanching the service.
-3. When the request with the same `session_id` to `generate` got a empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and the same for the afterwards.
-
-4. Requests were previously being handled sequentially rather than concurrently. To resolve this issue,
-
- - kindly provide unique session_id values when calling the `generate` API or else your requests may be associated with client IP addresses
+3. When a request with the same `session_id` to `/v1/chat/interactive` returns an empty value and a negative `tokens`, please consider setting `interactive_mode=false` to restart the session (a sketch follows this list).
-5. Both `generate` api and `v1/chat/completions` upport engaging in multiple rounds of conversation, where input `prompt` or `messages` consists of either single strings or entire chat histories.These inputs are interpreted using multi-turn dialogue modes. However, ff you want to turn the mode of and manage the chat history in clients, please the parameter `sequence_end: true` when utilizing the `generate` function, or specify `renew_session: true` when making use of `v1/chat/completions`
+4. The `/v1/chat/interactive` api disables multi-round conversation by default. The input argument `prompt` can be either a single string or an entire chat history.
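+
+A minimal sketch of the session reset mentioned in item 3, using the same request fields as the interactive chat example above:
+
+```bash
+curl http://{server_ip}:{server_port}/v1/chat/interactive \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "Hello! How are you?",
+    "session_id": 1,
+    "interactive_mode": false
+  }'
+```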
diff --git a/docs/en/serving.md b/docs/en/serving.md
index 1e6f783d7a..6cc18018d0 100644
--- a/docs/en/serving.md
+++ b/docs/en/serving.md
@@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf
+lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh
```
@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2
+lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh
```
@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
70B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8
+lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh
```
@@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \
+lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh
```
@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \
+lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh
```
@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
30B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \
+lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh
```
@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
65B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \
+lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh
```
@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b
+lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh
```
@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b
+lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh
```
diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md
index 1b51402056..78f4d2ce5d 100644
--- a/docs/en/supported_models/codellama.md
+++ b/docs/en/supported_models/codellama.md
@@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. Execu
python3 -m pip install lmdeploy
# convert weight layout
-python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model
+lmdeploy convert codellama /the/path/of/codellama/model
```
Then, you can communicate with codellama in consolo by following instructions in next sections
@@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in
### Completion
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### Infilling
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
The input code is supposed to have a special placeholder ``. For example,
@@ -64,7 +64,7 @@ And the generated code piece by `turbomind.chat` is the one to be filled in `
- ignore_eos: bool = False) -> Iterable[List[str]]:
- headers = {'User-Agent': 'Test Client'}
- pload = {
- 'prompt': prompt,
- 'stream': stream,
- 'session_id': session_id,
- 'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
- 'ignore_eos': ignore_eos
- }
- response = requests.post(
- api_url, headers=headers, json=pload, stream=stream)
- for chunk in response.iter_lines(
- chunk_size=8192, decode_unicode=False, delimiter=b'\n'):
- if chunk:
- data = json.loads(chunk.decode('utf-8'))
- output = data['text']
- tokens = data['tokens']
- yield output, tokens
-
-
-for output, tokens in get_streaming_response(
- "Hi, how are you?", "http://{server_ip}:{server_port}/generate", 0,
- 512):
- print(output, end='')
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+model_name = api_client.available_models[0]
+for item in api_client.completions_v1(model=model_name, prompt='hi'):
+ print(item)
+```
+
+LMDeploy 的 `/v1/chat/interactive` api 支持将对话内容管理在服务端,但是我们默认关闭。如果想尝试,请阅读以下介绍:
+
+- 交互模式下,对话历史保存在 server。在一次完整的多轮对话中,所有请求设置`interactive_mode = True`, `session_id`保持相同 (不为 -1,这是缺省值)。
+- 非交互模式下,server 不保存历史记录。
+
+交互模式可以通过 `interactive_mode` 布尔量参数控制。下面是一个普通模式的例子,
+如果要体验交互模式,将 `interactive_mode=True` 传入即可。
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+for item in api_client.generate(prompt='hi'):
+ print(item)
```
### Java/Golang/Rust
@@ -86,16 +86,15 @@ cURL 也可以用于查看 API 的输出结果
curl http://{server_ip}:{server_port}/v1/models
```
-使用 generate:
+Interactive Chat:
```bash
-curl http://{server_ip}:{server_port}/generate \
+curl http://{server_ip}:{server_port}/v1/chat/interactive \
-H "Content-Type: application/json" \
-d '{
"prompt": "Hello! How are you?",
"session_id": 1,
- "sequence_start": true,
- "sequence_end": true
+ "interactive_mode": true
}'
```
@@ -106,19 +105,19 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "internlm-chat-7b",
- "messages": [{"role": "user", "content": "Hello! Ho are you?"}]
+ "messages": [{"role": "user", "content": "Hello! How are you?"}]
}'
```
-Embeddings:
+Text Completions:
-```bash
-curl http://{server_ip}:{server_port}/v1/embeddings \
- -H "Content-Type: application/json" \
+```shell
+curl http://{server_ip}:{server_port}/v1/completions \
+ -H 'Content-Type: application/json' \
-d '{
- "model": "internlm-chat-7b",
- "input": "Hello world!"
- }'
+ "model": "llama",
+ "prompt": "two steps to build a house:"
+}'
```
### CLI client
@@ -126,8 +125,8 @@ curl http://{server_ip}:{server_port}/v1/embeddings \
restful api 服务可以通过客户端测试,例如
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
+lmdeploy serve api_client api_server_url
```
### webui
@@ -135,10 +134,10 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
也可以直接用 webui 测试使用 restful-api。
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-# server_ip 和 server_port 是用来提供 gradio ui 访问服务的
-# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
+# server_name 和 server_port 是用来提供 gradio ui 访问服务的
+# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
### FAQ
@@ -148,12 +147,6 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
2. 当服务端显存 OOM 时,可以适当减小启动服务时的 `instance_num` 个数
-3. 当同一个 `session_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false`
-
-4. 如果感觉请求不是并发地被处理,而是一个一个地处理,请设置好以下参数:
-
- - 不同的 session_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。
+3. 当同一个 `session_id` 的请求给 `/v1/chat/interactive` 函数后,出现返回空字符串和负值的 `tokens`,应该是 `session_id` 混乱了,可以先将交互模式关闭,再重新开启。
-5. `generate` api 和 `v1/chat/completions` 均支持多轮对话。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单词提问,也可以是一段对话历史。
- 两个 api 都是默认开启多伦对话的,如果你想关闭这个功能,然后在客户端管理会话记录,请设置 `sequence_end: true` 传入 `generate`,或者设置
- `renew_session: true` 传入 `v1/chat/completions`。
+4. `/v1/chat/interactive` api 支持多轮对话,但是默认关闭。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单次提问,也可以是一段对话历史。
diff --git a/docs/zh_cn/serving.md b/docs/zh_cn/serving.md
index e0a2f5a986..db4ebb8d3c 100644
--- a/docs/zh_cn/serving.md
+++ b/docs/zh_cn/serving.md
@@ -8,7 +8,7 @@
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf
+lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh
```
@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2
+lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh
```
@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
70B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8
+lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh
```
@@ -42,7 +42,7 @@ bash workspace/service_docker_up.sh
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \
+lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh
```
@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \
+lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh
```
@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
30B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \
+lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh
```
@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
65B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \
+lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh
```
@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b
+lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh
```
@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b
+lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh
```
diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md
index ca9029a527..017df62b5f 100644
--- a/docs/zh_cn/supported_models/codellama.md
+++ b/docs/zh_cn/supported_models/codellama.md
@@ -29,7 +29,7 @@
python3 -m pip install lmdeploy
# 转模型格式
-python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
+lmdeploy convert codellama /path/of/codellama/model
```
接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。
@@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
### 代码续写
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### 代码填空
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
输入的代码块中要包含 ``,比如:
@@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str:
### 对话
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python"
+lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python"
```
可以把 `--sys-instruct` 的指令换成 codellama 支持的其他变成语言。
@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid
### Python 专项
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap python
+lmdeploy chat turbomind ./workspace --cap python
```
建议这里部署 Python 微调模型
@@ -90,7 +90,7 @@ TBD
```shell
# --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数
# --tp: 在 tensor parallel时,使用的GPU数量
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。
@@ -98,17 +98,17 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --
你可以用命令行,在控制台与 server 通信:
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
+lmdeploy serve api_client api_server_url
```
或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流:
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
# server_ip 和 server_port 是用来提供 gradio ui 访问服务的
-# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。
diff --git a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md
index 68cc094df8..e0a220eb60 100644
--- a/docs/zh_cn/w4a16.md
+++ b/docs/zh_cn/w4a16.md
@@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
```shell
## 转换模型的layout,存放在默认路径 ./workspace 下
-python3 -m lmdeploy.serve.turbomind.deploy \
+lmdeploy convert \
--model-name llama2 \
--model-path ./llama2-chat-7b-w4 \
--model-format awq \
--group-size 128
## 推理
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
## 启动 gradio 服务
@@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务
```shell
-python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port}
+lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port}
```
然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话
@@ -82,7 +82,7 @@ python benchmark/profile_generation.py \
### 第一步:生成量化参数
```shell
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval
--calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小
@@ -95,7 +95,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。
```shell
-python3 -m lmdeploy.lite.apis.auto_awq \
+lmdeploy lite auto_awq \
--model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸
diff --git a/lmdeploy/cli/__init__.py b/lmdeploy/cli/__init__.py
new file mode 100644
index 0000000000..3575bec5bd
--- /dev/null
+++ b/lmdeploy/cli/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .cli import run
+
+__all__ = ['run']
diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py
new file mode 100644
index 0000000000..735b24c7cc
--- /dev/null
+++ b/lmdeploy/cli/chat.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+
+class SubCliChat(object):
+ """Chat through terminal with pytorch or turbomind model."""
+
+ def torch(self,
+ model_path: str,
+ tokenizer_path: Optional[str] = None,
+ accel: Optional[str] = None,
+ max_new_tokens: int = 128,
+ temperature: float = 0.8,
+ top_p: float = 0.95,
+ seed: int = 0,
+ use_fast_tokenizer: bool = True,
+ max_alloc: int = 2048,
+ max_session_len: int = None,
+ log_file: Optional[str] = None,
+ debug: bool = False,
+ adapter: Optional[str] = None):
+ """Chat with pytorch model through terminal.
+
+ Args:
+ model_path (str): Path to pytorch model.
+ tokenizer_path (str): Path to tokenizer.
+ accel (str): Model accelerator.
+ max_new_tokens (int): Maximum number of tokens to generate.
+ temperature (float): Temperature for sampling.
+ top_p (float): Top p for sampling.
+ seed (int): Random seed.
+ use_fast_tokenizer (bool): Whether to use fast tokenizer.
+                This argument is passed directly to transformers'
+                ``AutoTokenizer.from_pretrained``.
+                Generally, users should use the fast tokenizer.
+                If the fast tokenizer raises an error, try forcing a slow one.
+ max_alloc (int): Maximum memory to allocate (for deepspeed).
+ max_session_len (int): Maximum number of tokens allowed for all chat sessions.
+                This includes both the history and the current session.
+ log_file (str): Path to log file.
+ debug (bool): Whether to enable debug mode.
+            adapter (str): Force a specific adapter to be used.
+                Generally, users should not set this argument because the adapter
+                is selected automatically based on the model type. It is only
+                required when that is impossible, e.g. llama 1 and llama 2 cannot
+                be distinguished by the `LlamaForCausalLM` class. Currently, only
+                "llama1" is accepted, for llama1 models.
+ """ # noqa: E501
+ from lmdeploy.pytorch.chat import main as run_torch_model
+
+ run_torch_model(model_path,
+ tokenizer_path=tokenizer_path,
+ accel=accel,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ seed=seed,
+ use_fast_tokenizer=use_fast_tokenizer,
+ max_alloc=max_alloc,
+ max_session_len=max_session_len,
+ log_file=log_file,
+ debug=debug,
+ adapter=adapter)
+
+ def turbomind(self,
+ model_path,
+ session_id: int = 1,
+ cap: str = 'chat',
+ tp=1,
+ stream_output=True,
+ **kwargs):
+ """Chat with turbomind model through terminal.
+
+ Args:
+ model_path (str): the path of the deployed model
+ session_id (int): the identical id of a session
+            cap (str): the capability of a model. For example, codellama
+                supports the capabilities ['completion', 'infilling',
+                'chat', 'python']
+ tp (int): GPU number used in tensor parallelism
+ stream_output (bool): indicator for streaming output or not
+            **kwargs (dict): other arguments for initializing model's chat
+ template
+ """
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ run_turbomind_model(model_path,
+ session_id=session_id,
+ cap=cap,
+ tp=tp,
+ stream_output=stream_output,
+ **kwargs)
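For orientation, the fire-based entry point in `cli.py` below maps shell flags onto these method arguments one to one. A rough, illustrative equivalence:

```python
# Illustration only: `lmdeploy chat turbomind ./workspace --cap python --tp 2`
# ends up calling the method above roughly like this; the workspace path is a placeholder.
from lmdeploy.cli.chat import SubCliChat

SubCliChat().turbomind('./workspace', cap='python', tp=2)  # starts an interactive terminal session
```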
diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
new file mode 100644
index 0000000000..ab15cb46ad
--- /dev/null
+++ b/lmdeploy/cli/cli.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+import fire
+
+from .chat import SubCliChat
+from .lite import SubCliLite
+from .serve import SubCliServe
+
+
+class CLI(object):
+ """LMDeploy Command Line Interface.
+
+ The CLI provides a unified API for converting, compressing and deploying
+ large language models.
+ """
+
+ def convert(self,
+ model_name: str,
+ model_path: str,
+ model_format: str = None,
+ tokenizer_path: str = None,
+ dst_path: str = './workspace',
+ tp: int = 1,
+ quant_path: str = None,
+ group_size: int = 0):
+ """Convert LLMs to lmdeploy format.
+
+ Args:
+ model_name (str): The name of the to-be-deployed model, such as
+                llama-7b, llama-13b, vicuna-7b, etc.
+ model_path (str): The directory path of the model
+            model_format (str): the format of the model, chosen from
+                ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
+                format, 'hf' means huggingface llama format, and 'awq' means a
+                llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
+                The default value is None, which means the model_format will be
+                inferred from model_name
+ tokenizer_path (str): The path of tokenizer model.
+ dst_path (str): The destination path that saves outputs.
+ tp (int): The number of GPUs used for tensor parallelism, which
+ should be 2^n.
+ quant_path (str): Path of the quantized model, which can be None.
+ group_size (int): A parameter used in AWQ to quantize fp16 weights
+ to 4 bits.
+ """
+ from lmdeploy.turbomind.deploy.converter import main as convert
+
+ convert(model_name,
+ model_path,
+ model_format=model_format,
+ tokenizer_path=tokenizer_path,
+ dst_path=dst_path,
+ tp=tp,
+ quant_path=quant_path,
+ group_size=group_size)
+
+ def list(self, engine: str = 'turbomind'):
+ """List supported model names.
+
+        Example 1:
+ lmdeploy list
+
+        Example 2:
+ lmdeploy list --engine pytorch
+
+ Args:
+ engine (str): The backend for the model to run. Choice from
+ ['turbomind', 'pytorch'].
+ """
+ assert engine in ['turbomind', 'pytorch']
+ if engine == 'pytorch':
+ model_names = ['llama', 'llama2', 'internlm-7b']
+ elif engine == 'turbomind':
+ from lmdeploy.model import MODELS
+ model_names = list(MODELS.module_dict.keys())
+ model_names = [n for n in model_names if n.lower() not in ['base']]
+ model_names.sort()
+ print('Supported model names:')
+ print('\n'.join(model_names))
+
+ def check_env(self, dump_file: str = None):
+ """Check env information.
+
+ Args:
+ dump_file (str): Output file to save env info.
+ """
+
+ import importlib
+
+ import mmengine
+ from mmengine.utils import get_git_hash
+ from mmengine.utils.dl_utils import collect_env
+
+ from lmdeploy.version import __version__
+
+ env_info = collect_env()
+ env_info['LMDeploy'] = __version__ + '+' + get_git_hash()[:7]
+
+ # remove some unnecessary info
+ remove_reqs = ['MMEngine', 'OpenCV']
+ for req in remove_reqs:
+ if req in env_info:
+ env_info.pop(req)
+
+ # extra important dependencies
+ extra_reqs = ['transformers', 'gradio', 'fastapi', 'pydantic']
+
+ for req in extra_reqs:
+ try:
+ env_info[req] = importlib.import_module(req).__version__
+ except Exception:
+ env_info[req] = 'Not Found'
+
+ # print env info
+ for k, v in env_info.items():
+ print(f'{k}: {v}')
+
+ # dump to local file
+ if dump_file is not None:
+ work_dir, _ = os.path.split(dump_file)
+ if work_dir:
+ os.makedirs(work_dir, exist_ok=True)
+ mmengine.dump(env_info, dump_file)
+
+
+def run():
+ """The entry point of running LMDeploy CLI."""
+
+ cli = CLI()
+ cli.lite = SubCliLite()
+ cli.chat = SubCliChat()
+ cli.serve = SubCliServe()
+
+ fire.Fire(cli, name='lmdeploy')
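Since `fire.Fire(cli, name='lmdeploy')` exposes exactly the methods above as sub-commands, the same class can be used programmatically. A small sketch; the dump file name is arbitrary.

```python
# Sketch: programmatic use of the CLI object; output goes to stdout as on the command line.
from lmdeploy.cli.cli import CLI

cli = CLI()
cli.list(engine='turbomind')        # same listing as `lmdeploy list`
cli.check_env(dump_file='env.yml')  # same report as `lmdeploy check_env`, plus a YAML dump
```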
diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py
new file mode 100644
index 0000000000..4302765e28
--- /dev/null
+++ b/lmdeploy/cli/lite.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+
+class SubCliLite(object):
+ """CLI for compressing LLMs."""
+
+ def auto_awq(self,
+ model: str,
+ work_dir: str,
+ w_bits: int = 4,
+ w_sym: bool = False,
+ w_group_size: int = 128,
+ device: str = 'cuda'):
+ """Perform weight quantization using AWQ algorithm.
+
+ Args:
+ model (str): The path of model in hf format.
+ work_dir (str): The working directory to save results.
+ w_bits (int): Bit number for weight quantization.
+ w_sym (bool): Whether to do symmetric quantization.
+ w_group_size (int): Group size for weight quantization statistics.
+            device (str): The device to run on.
+ """
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+
+ auto_awq(model,
+ work_dir,
+ w_bits=w_bits,
+ w_sym=w_sym,
+ w_group_size=w_group_size,
+ device=device)
+
+ def calibrate(self,
+ model: str,
+ calib_dataset: str = 'c4',
+ calib_samples: int = 128,
+ calib_seqlen: int = 2048,
+ work_dir: str = './work_dir',
+ device: str = 'cuda') -> None:
+ """Perform calibration on a given dataset.
+
+ Args:
+ model (str): The model to be loaded.
+ calib_dataset (str, optional): The calibration dataset name.
+ Defaults to 'c4'.
+ calib_samples (int, optional): The number of samples for
+ calibration. Defaults to 128.
+ calib_seqlen (int, optional): The sequence length for calibration.
+ Defaults to 2048.
+ work_dir (str): The working directory for outputs.
+ Defaults to './work_dir'.
+ device (str, optional): The device to be used for calculation.
+ Defaults to 'cuda'.
+ """
+ from lmdeploy.lite.apis.calibrate import calibrate
+
+ calibrate(model,
+ calib_dataset=calib_dataset,
+ calib_samples=calib_samples,
+ calib_seqlen=calib_seqlen,
+ work_dir=work_dir,
+ device=device)
+
+ def kv_qparams(self,
+ work_dir: str,
+ turbomind_dir: str,
+ kv_bits: int = 8,
+ kv_sym: bool = False,
+ num_tp: int = 1) -> None:
+ """Export key and value stats.
+
+ Args:
+ work_dir (str): Directory path where the stats
+ are saved.
+            turbomind_dir (str): Directory path where the
+                results will be saved.
+ kv_bits (int, optional): Number of bits for quantization.
+ Defaults to 8.
+ kv_sym (bool, optional): Whether to use symmetric quantization.
+ Defaults to False.
+            num_tp (int, optional): The number of GPUs used for tensor
+                parallelism. Defaults to 1.
+ """
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ run_kv_qparams(work_dir,
+ turbomind_dir,
+ kv_bits=kv_bits,
+ kv_sym=kv_sym,
+ num_tp=num_tp)
+
+ def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
+ """Convert a hugging face model to the smallest sharded one.
+
+ Args:
+ src_dir (str): The directory of the input HF model.
+ dst_dir (str): The directory to save new model.
+ """
+ from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
+ run_sharded(src_dir, dst_dir)
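Completing the quantization flow, the KV-cache step wrapped by `kv_qparams` above can also be invoked from Python. A sketch, assuming a turbomind workspace produced by `lmdeploy convert`; the directory names are placeholders.

```python
# Sketch: export int8 KV-cache quantization parameters into a turbomind workspace.
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams

run_kv_qparams('./quant_output',                     # stats written by `lmdeploy lite calibrate`
               './workspace/triton_models/weights',  # hypothetical turbomind weight directory
               kv_bits=8,
               kv_sym=False,
               num_tp=1)
```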
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
new file mode 100644
index 0000000000..33580cdfe1
--- /dev/null
+++ b/lmdeploy/cli/serve.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+
+class SubCliServe(object):
+ """Serve LLMs and interact on terminal or web UI."""
+
+ def gradio(self,
+ model_path_or_server: str,
+ server_name: str = '0.0.0.0',
+ server_port: int = 6006,
+ batch_size: int = 32,
+ tp: int = 1,
+ restful_api: bool = False):
+ """Serve LLMs with web ui using gradio.
+
+ Example 1:
+ lmdeploy serve gradio ./workspace
+
+ Example 2:
+ lmdeploy serve gradio http://0.0.0.0:23333
+ --server_name 0.0.0.0
+ --server_port 6006
+ --restful_api True
+
+ Example 3:
+            lmdeploy serve gradio ${triton_server_ip_address}:33337
+
+ Args:
+            model_path_or_server (str): the path of the deployed model or the
+                tritonserver URL or restful api URL. The former runs the
+                service with gradio directly. The latter runs against
+                tritonserver by default; if the URL points to a restful api,
+                please also enable the `restful_api` flag.
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ batch_size (int): batch size for running Turbomind directly
+ tp (int): tensor parallel for Turbomind
+ restful_api (bool): a flag for model_path_or_server
+ """
+ from lmdeploy.serve.gradio.app import run
+ run(model_path_or_server,
+ server_name=server_name,
+ server_port=server_port,
+ batch_size=batch_size,
+ tp=tp,
+ restful_api=restful_api)
+
+ def api_server(self,
+ model_path: str,
+ server_name: str = '0.0.0.0',
+ server_port: int = 23333,
+ instance_num: int = 32,
+ tp: int = 1,
+ allow_origins: List[str] = ['*'],
+ allow_credentials: bool = True,
+ allow_methods: List[str] = ['*'],
+ allow_headers: List[str] = ['*']):
+ """Serve LLMs with restful api using fastapi.
+
+ Args:
+ model_path (str): the path of the deployed model
+ server_name (str): host ip for serving
+ server_port (int): server port
+ instance_num (int): number of instances of turbomind model
+ tp (int): tensor parallel
+ allow_origins (List[str]): a list of allowed origins for CORS
+ allow_credentials (bool): whether to allow credentials for CORS
+ allow_methods (List[str]): a list of allowed HTTP methods for CORS
+ allow_headers (List[str]): a list of allowed HTTP headers for CORS
+ """
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ run_api_server(model_path,
+ server_name=server_name,
+ server_port=server_port,
+ instance_num=instance_num,
+ tp=tp,
+ allow_origins=allow_origins,
+ allow_credentials=allow_credentials,
+ allow_methods=allow_methods,
+ allow_headers=allow_headers)
+
+ def api_client(self, restful_api_url: str, session_id: int = 0):
+ """Interact with restful api server in terminal.
+
+ Args:
+ restful_api_url: The restful api URL.
+ session_id: The identical id of a session.
+ """
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ run_api_client(restful_api_url, session_id=session_id)
+
+ def triton_client(self,
+ tritonserver_addr: str,
+ session_id: int = 1,
+ cap: str = 'chat',
+ stream_output: bool = True,
+ **kwargs):
+ """Interact with Triton Server using gRPC protocol.
+
+ Args:
+ tritonserver_addr (str): the address in format "ip:port" of
+ triton inference server
+ session_id (int): the identical id of a session
+ cap (str): the capability of a model. For example, codellama
+ has the ability among ['completion', 'infill', 'instruct',
+ 'python']
+ stream_output (bool): indicator for streaming output or not
+ **kwargs (dict): other arguments for initializing model's
+ chat template
+ """
+
+ from lmdeploy.serve.client import main as run_triton_client
+
+ run_triton_client(
+ tritonserver_addr,
+ session_id=session_id,
+ cap=cap,
+ stream_output=stream_output,
+ **kwargs,
+ )
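The serve wrappers are thin. Below is a sketch of launching the RESTful server directly from Python, using the same underlying entry point that `api_server()` above imports; the workspace path is a placeholder.

```python
# Sketch: start the OpenAI-style RESTful server programmatically; blocks until interrupted.
from lmdeploy.serve.openai.api_server import main as run_api_server

run_api_server('./workspace',
               server_name='0.0.0.0',
               server_port=23333,
               instance_num=32,
               tp=1)
```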
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
index 3517f51b85..38f067b563 100644
--- a/lmdeploy/lite/apis/auto_awq.py
+++ b/lmdeploy/lite/apis/auto_awq.py
@@ -2,7 +2,6 @@
from pathlib import Path
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
@@ -16,13 +15,15 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
- 'BaiChuanForCausalLM': 'DecoderLayer',
+ 'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
+ 'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
- 'BaiChuanForCausalLM': 'RMSNorm',
+ 'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
+ 'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}
@@ -41,6 +42,9 @@ def auto_awq(model: str,
hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
checkpoint = hf_config._name_or_path
+ # hard code for qwen, other configs do not have the `fp16` attribute.
+ hf_config.fp16 = True
+
with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
@@ -62,11 +66,14 @@ def auto_awq(model: str,
device_map[name] = 'cpu'
else:
device_map[name] = 0
- load_checkpoint_in_model(model, checkpoint, device_map)
+ load_checkpoint_in_model(model,
+ checkpoint,
+ device_map,
+ dtype=torch.float16)
work_dir = Path(work_dir)
- act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmean']
+ act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
@@ -81,5 +88,6 @@ def auto_awq(model: str,
if __name__ == '__main__':
+ import fire
fire.Fire(auto_awq)
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 38b6429a19..27d631bdad 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -1,11 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
from pathlib import Path
+from typing import Union
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
+from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from lmdeploy.lite.quantization import CalibrationContext
@@ -14,17 +15,90 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
- 'BaiChuanForCausalLM': 'DecoderLayer',
+ 'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
+ 'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
- 'BaiChuanForCausalLM': 'RMSNorm',
+ 'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
+ 'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}
+def _prepare_for_calibrate(model: nn.Module,
+ layer_type: Union[str, type],
+ head_name: str = 'lm_head',
+ device: str = 'cuda',
+ prefix: str = '') -> None:
+ """Prepare the model for calibration by moving specific modules to CPU.
+
+ This function goes through each child of a given model and checks whether
+ it is an instance of a certain layer type or has the name equal to
+ `head_name`.
+ If yes, it moves the module to CPU, otherwise to the specified device
+ (default is CUDA).
+
+ If the child contains the target layer type in its sub-modules, the
+ function performs the same operation recursively.
+
+ Parameters
+ ----------
+ model : nn.Module
+ The PyTorch model to prepare for calibration.
+ layer_type : Union[str, Type]
+ The type of the layer to be moved to CPU. Can be either a string of
+ class name or the class type itself.
+ head_name : str, optional
+ The name of the module to be moved to CPU. Default is 'lm_head'.
+ device : str, optional
+ The device to which modules not matching the `layer_type` or
+ `head_name` will be moved. Default is 'cuda'.
+ prefix : str, optional
+ The prefix used when printing the names of the moved modules.
+ Default is ''.
+
+ Raises
+ ------
+ TypeError
+ If `layer_type` is neither a string nor a type.
+ """
+
+ for name, child in model.named_children():
+
+ # Check if the child is an instance of the given layer type
+ if isinstance(layer_type, str):
+ is_layer = type(child).__name__ == layer_type
+ elif isinstance(layer_type, type):
+ is_layer = isinstance(child, layer_type)
+ else:
+ raise TypeError(
+ 'layer_type should be a string (class name) or a type')
+
+ # Check if the child contains the target module type
+ contain_layer = len(
+ collect_target_modules(child, layer_type, [head_name]).keys()) > 0
+
+ # Check if the child matches the head name
+ is_head = name == head_name
+
+ mod_name = f'{prefix}.{name}' if prefix else name
+
+ # If the child is either an instance of the layer type or has the
+ # head name, move it to CPU, otherwise move it to the specified device
+ if is_layer or is_head:
+ child.to('cpu')
+ print(f'Move {mod_name} to CPU.')
+ elif contain_layer:
+ _prepare_for_calibrate(child, layer_type, head_name, device,
+ mod_name)
+ else:
+ child.to(device)
+            print(f'Move {mod_name} to {device}.')
+
+
def calibrate(model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
@@ -55,16 +129,38 @@ def calibrate(model: str,
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
trust_remote_code=True)
- hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+ hf_config = AutoConfig.from_pretrained(model,
+ torch_dtype=torch.float16,
+ trust_remote_code=True)
checkpoint = hf_config._name_or_path
+ # hard code for qwen, other configs do not have the `fp16` attribute.
+ hf_config.fp16 = True
+
with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
+ config=hf_config,
torch_dtype=torch.float16,
trust_remote_code=True)
model.config.use_cache = False
+ model_type = type(model).__name__
+ if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
+ raise RuntimeError(
+            f'Currently, quantization and calibration of {model_type} are '
+ f'not supported. The supported model types are '
+ f"{', '.join(LAYER_TYPE_MAP.keys())}.")
+
+ if model_type == 'QWenLMHeadModel':
+ try:
+ import flash_attn # noqa: F401
+ except ImportError:
+ raise RuntimeError(
+ 'When using Qwen, you need to `pip install flash-attn` first, '
+                'otherwise calibration and quantization will not work '
+ 'properly.')
+
layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]
@@ -78,7 +174,12 @@ def calibrate(model: str,
device_map[name] = 'cpu'
else:
device_map[name] = 0
- load_checkpoint_in_model(model, checkpoint, device_map)
+ load_checkpoint_in_model(model,
+ checkpoint,
+ device_map,
+ dtype=torch.float16)
+
+ _prepare_for_calibrate(model, layer_type, 'lm_head', device)
print('Loading calibrate dataset ...')
calib_loader, _ = get_calib_loaders(calib_dataset,
@@ -107,4 +208,6 @@ def calibrate(model: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(calibrate)
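To make the device-placement logic of `_prepare_for_calibrate` above concrete, here is a toy sketch with made-up module classes. It keeps everything on CPU so it runs without a GPU and only demonstrates which children are moved where.

```python
# Toy sketch: decoder layers and lm_head stay on CPU, the rest goes to `device`.
import torch.nn as nn
from lmdeploy.lite.apis.calibrate import _prepare_for_calibrate

class ToyDecoderLayer(nn.Module):  # hypothetical stand-in for a real decoder layer
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 8)

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(100, 8)
        self.layers = nn.ModuleList([ToyDecoderLayer() for _ in range(2)])
        self.lm_head = nn.Linear(8, 100)

model = ToyModel()
# device='cpu' keeps the sketch runnable on machines without CUDA
_prepare_for_calibrate(model, 'ToyDecoderLayer', head_name='lm_head', device='cpu')
```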
diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py
index 7d43078daf..f31fee0299 100644
--- a/lmdeploy/lite/apis/kv_qparams.py
+++ b/lmdeploy/lite/apis/kv_qparams.py
@@ -2,7 +2,6 @@
from pathlib import Path
from typing import Union
-import fire
import numpy as np
import torch
@@ -120,5 +119,6 @@ def main(work_dir: str,
if __name__ == '__main__':
+ import fire
fire.Fire(main)
diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
index c9811563fd..4dca8b1469 100644
--- a/lmdeploy/lite/quantization/awq.py
+++ b/lmdeploy/lite/quantization/awq.py
@@ -18,6 +18,10 @@
'QWenBlock': {
'ln_1': ['attn.c_attn'],
'ln_2': ['mlp.w1', 'mlp.w2']
+ },
+ 'DecoderLayer': {
+ 'input_layernorm': ['self_attn.W_pack'],
+ 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
}
}
@@ -33,6 +37,10 @@
'QWenBlock': {
'attn.c_attn': ['attn.c_proj'],
'mlp.w1': ['mlp.c_proj']
+ },
+ 'DecoderLayer': {
+ 'self_attn.W_pack': ['self_attn.o_proj'],
+ 'mlp.up_proj': ['mlp.down_proj']
}
}
@@ -69,7 +77,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)
scales = (act_scales.pow(alpha) /
- w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
+ w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
ln.weight.div_(scales)
@@ -116,10 +124,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)
scales = (act_scales.pow(alpha) /
- w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
+ w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
- # (for qwen) pre_fc is packed QKV, only V needs to scale
+ # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
if size_pre_fc > size_a and size_pre_fc % size_a == 0 \
and size_pre_fc // size_a == 3:
diff --git a/lmdeploy/lite/quantization/weight/quantizer.py b/lmdeploy/lite/quantization/weight/quantizer.py
index 56cfda8f01..1d01696eb9 100644
--- a/lmdeploy/lite/quantization/weight/quantizer.py
+++ b/lmdeploy/lite/quantization/weight/quantizer.py
@@ -8,7 +8,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
- cal_qparams_per_tensor_minmax)
+ cal_qparams_per_tensor_minmax, precise_round)
from lmdeploy.lite.utils.global_avail import GlobalAvailMixin
@@ -119,8 +119,10 @@ def quant(self,
torch.Tensor: The fake quantized weight tensor.
"""
+ float_w = weight.float()
+
if qparams is None:
- qparams = self.calculate_qparams(weight)
+ qparams = self.calculate_qparams(float_w)
scales = qparams.scales
zero_points = qparams.zero_points
@@ -133,17 +135,18 @@ def quant(self,
# per group scales shape: [out_c, in_c//group_size, 1]
if len(scales.shape) > 2:
# scales shape: [out_c, in_c//group_size, 1]
- weight = weight.reshape(out_c, scales.shape[1], -1)
+ float_w = float_w.reshape(out_c, scales.shape[1], -1)
if zero_points is None:
assert self.symmetry
- real_qweight = (weight / scales).round()
+ real_qweight = (float_w / scales).round()
fake_qweight = real_qweight * scales
else:
assert not self.symmetry
- real_qweight = (weight / scales).round() + zero_points
+ real_qweight = precise_round(
+ (float_w - float_w.min(-1, keepdim=True)[0]) / scales)
fake_qweight = (real_qweight - zero_points) * scales
if len(scales.shape) > 2:
@@ -153,4 +156,4 @@ def quant(self,
if real:
return real_qweight.to(torch.int32)
else:
- return fake_qweight
+ return fake_qweight.to(weight.dtype)
diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py
index c2b56287bd..2561fdb23f 100644
--- a/lmdeploy/lite/utils/__init__.py
+++ b/lmdeploy/lite/utils/__init__.py
@@ -6,7 +6,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
- cal_qparams_per_tensor_minmax)
+ cal_qparams_per_tensor_minmax, precise_round)
from .calib_dataloader import get_calib_loaders
from .collect import (bimap_name_mod, collect_target_modules,
collect_target_weights)
@@ -16,7 +16,7 @@
'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax',
'cal_qparams_per_group_absmax', 'cal_qparams_per_group_minmax',
'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax',
- 'QParams', 'get_calib_loaders', 'collect_target_modules',
+ 'QParams', 'get_calib_loaders', 'collect_target_modules', 'precise_round',
'collect_target_weights', 'GlobalAvailMixin', 'split_decoder_layer_inputs',
'bimap_name_mod', 'concat_decoder_layer_outputs'
]
diff --git a/lmdeploy/lite/utils/cal_qparams.py b/lmdeploy/lite/utils/cal_qparams.py
index a682704a55..569297cdb5 100644
--- a/lmdeploy/lite/utils/cal_qparams.py
+++ b/lmdeploy/lite/utils/cal_qparams.py
@@ -11,16 +11,22 @@ class QParams(NamedTuple):
zero_points: Optional[torch.Tensor]
+@torch.no_grad()
+def precise_round(x):
+    """Round halves away from zero (torch.round rounds ties to even)."""
+    return x.sign() * (x.abs() + 0.5).floor()
+
+
@torch.no_grad()
def cal_qparams_per_channel_absmax(w: torch.Tensor,
n_bits: int,
return_stats: bool = False) -> QParams:
"""Calculate quantization parameters for each channel using absolute max
value."""
+ float_w = w.float()
- absmax = w.abs().max(dim=-1, keepdim=True)[0]
+ absmax = float_w.abs().max(dim=-1, keepdim=True)[0]
q_max = 2**(n_bits - 1) - 1
- scales = absmax.clamp(min=1e-5).div(q_max)
+ scales = absmax.div(q_max)
if return_stats:
return QParams(scales=scales, zero_points=None), absmax
@@ -35,14 +41,16 @@ def cal_qparams_per_channel_minmax(w: torch.Tensor,
"""Calculate quantization parameters for each channel using min and max
values."""
- w_min = w.min(dim=-1, keepdim=True)[0]
- w_max = w.max(dim=-1, keepdim=True)[0]
+ float_w = w.float()
+
+ w_min = float_w.min(dim=-1, keepdim=True)[0]
+ w_max = float_w.max(dim=-1, keepdim=True)[0]
q_max = 2**n_bits - 1
scales = (w_max - w_min)
- scales = scales.clamp_(min=1e-5).div_(q_max)
+ scales = scales.div_(q_max)
- zero_points = (-w_min / scales).round()
+ zero_points = precise_round(-w_min / scales)
if return_stats:
return QParams(scales=scales, zero_points=zero_points), (w_min, w_max)
@@ -63,9 +71,12 @@ def cal_qparams_per_group_absmax(w: torch.Tensor,
'Input channels should be greater than or equal to group_size.'
assert inc % group_size == 0, \
'Input channels should be divisible by group_size.'
- absmax = w.abs().reshape(outc, -1, group_size).max(dim=-1, keepdim=True)[0]
+
+ float_w = w.float()
+ absmax = float_w.abs().reshape(outc, -1, group_size).max(dim=-1,
+ keepdim=True)[0]
q_max = 2**(n_bits - 1) - 1
- scales = absmax.clamp(min=1e-5).div(q_max)
+ scales = absmax.div(q_max)
if return_stats:
return QParams(scales=scales, zero_points=None), absmax
else:
@@ -85,14 +96,16 @@ def cal_qparams_per_group_minmax(w: torch.Tensor,
'Input channels should be greater than or equal to group_size.'
assert inc % group_size == 0, \
'Input channels should be divisible by group_size.'
- w_group_wise = w.reshape(outc, -1, group_size)
+
+ float_w = w.float()
+ w_group_wise = float_w.reshape(outc, -1, group_size)
w_min = w_group_wise.min(dim=-1, keepdim=True)[0]
w_max = w_group_wise.max(dim=-1, keepdim=True)[0]
q_max = 2**n_bits - 1
scales = (w_max - w_min)
- scales = scales.clamp_(min=1e-5).div_(q_max)
- zero_points = (-w_min / scales).round()
+ scales = scales.div_(q_max)
+ zero_points = precise_round(-w_min / scales)
if return_stats:
return QParams(scales=scales, zero_points=zero_points), (w_min, w_max)
else:
@@ -106,13 +119,15 @@ def cal_qparams_per_tensor_minmax(w: torch.Tensor,
"""Calculate quantization parameters for the entire tensor using min and
max values."""
- w_min = w.min()
- w_max = w.max()
+ float_w = w.float()
+
+ w_min = float_w.min()
+ w_max = float_w.max()
q_max = 2**n_bits - 1
scales = (w_max - w_min)
scales = scales.clamp_(min=1e-5).div_(q_max)
- zero_points = (-w_min / scales).round()
+ zero_points = precise_round(-w_min / scales)
if return_stats:
return QParams(scales=scales, zero_points=zero_points), (w_min, w_max)
else:
@@ -125,9 +140,10 @@ def cal_qparams_per_tensor_absmax(w: torch.Tensor,
return_stats: bool = False) -> QParams:
"""Calculate quantization parameters for the entire tensor using absolute
max value."""
- absmax = w.abs().max()
+ float_w = w.float()
+ absmax = float_w.abs().max()
q_max = 2**(n_bits - 1) - 1
- scales = absmax.clamp(min=1e-5).div(q_max)
+ scales = absmax.div(q_max)
if return_stats:
return QParams(scales=scales, zero_points=None), absmax
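The new `precise_round` helper differs from `torch.round` only in how it treats ties, which matters for the zero-point and weight rounding above. A tiny sketch of the difference:

```python
# Sketch: torch.round uses round-half-to-even, precise_round rounds halves away from zero.
import torch
from lmdeploy.lite.utils import precise_round

x = torch.tensor([0.5, 1.5, 2.5, -0.5, -1.5])
print(torch.round(x))    # -> 0., 2., 2., -0., -2.  (ties go to the even neighbour)
print(precise_round(x))  # -> 1., 2., 3., -1., -2.  (ties move away from zero)
```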
diff --git a/lmdeploy/lite/utils/collect.py b/lmdeploy/lite/utils/collect.py
index 8b2691a4a6..3b66ef6146 100644
--- a/lmdeploy/lite/utils/collect.py
+++ b/lmdeploy/lite/utils/collect.py
@@ -1,7 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Tuple, Union
-from mmengine.config.lazy import LazyAttr
from torch import nn
@@ -22,9 +21,6 @@ def collect_target_modules(model: nn.Module,
A dictionary mapping from module names to module instances.
"""
- if isinstance(target, LazyAttr):
- target = target.build()
-
if not isinstance(target, (type, str)):
raise TypeError('Target must be a string (name of the module) '
'or a type (class of the module)')
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index b3fc86f999..81b8229f6a 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -115,6 +115,7 @@ def update_input_ids(self, input_ids: List[int]):
return input_ids
+@MODELS.register_module(name='wizardlM')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseModel):
"""Chat template of vicuna model."""
@@ -177,15 +178,16 @@ class InternLMChat7B(BaseModel):
def __init__(
self,
- system='<|System|>',
+ system='<|System|>:',
meta_instruction="""You are an AI assistant whose name is InternLM (书生·浦语).
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
""", # noqa: E501
- user='<|User|>',
- eoh='',
- eoa='',
- assistant='<|Bot|>',
+ user='<|User|>:',
+ eoh='\n',
+ eoa='\n',
+ eosys='\n',
+ assistant='<|Bot|>:',
stop_words=[''],
**kwargs):
super().__init__(**kwargs)
@@ -194,6 +196,7 @@ def __init__(
self.user = user
self.eoh = eoh
self.eoa = eoa
+ self.eosys = eosys
self.assistant = assistant
self.stop_words = stop_words
@@ -211,12 +214,12 @@ def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
- return f'{self.system}:{self.meta_instruction}\n' \
- f'{self.user}:{prompt}{self.eoh}\n' \
- f'{self.assistant}:'
+ return f'{self.system}{self.meta_instruction}{self.eosys}' \
+ f'{self.user}{prompt}{self.eoh}' \
+ f'{self.assistant}'
else:
- return f'\n{self.user}:{prompt}{self.eoh}\n' \
- f'{self.assistant}:'
+ return f'\n{self.user}{prompt}{self.eoh}' \
+ f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
@@ -227,17 +230,19 @@ def messages2prompt(self, messages, sequence_start=True):
Returns:
str: the concatenated prompt
"""
+
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
- system, users, assistants = self._translate_messages(messages)
- system = self.meta_instruction if not system else system
- ret = f'{self.system}:{system}\n'
- for user, assistant in zip(users, assistants):
- if assistant:
- ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
- f'{assistant}{self.eoa}\n'
- else:
- ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
+ eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
+ ret = ''
+ if self.meta_instruction:
+ ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
+
+ for message in messages:
+ role = message['role']
+ content = message['content']
+ ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
+ ret += f'{self.assistant}:'
return ret
@@ -386,15 +391,16 @@ def messages2prompt(self, messages, sequence_start=True):
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
- system, users, assistants = self._translate_messages(messages)
- system = self.system if not system else system
- ret = f'{system}{self.meta_instruction}{self.eosys}'
- for user, assistant in zip(users, assistants):
- if assistant:
- ret += f'{self.user}{user}{self.eoh}{self.assistant}' \
- f'{assistant}{self.eoa}'
- else:
- ret += f'{self.user}{user}{self.eoh}{self.assistant}'
+ eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
+ ret = ''
+ if self.meta_instruction:
+ ret += f'{self.system}{self.meta_instruction}{self.eosys}'
+
+ for message in messages:
+ role = message['role']
+ content = message['content']
+ ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
+ ret += f'{self.assistant}'
return ret
@@ -625,6 +631,141 @@ def update_input_ids(self, input_ids: List):
return input_ids
+@MODELS.register_module(name='solar')
+class SOLAR(BaseModel):
+ """Chat template of SOLAR model.
+
+ `https://huggingface.co/upstage/SOLAR-0-70b-16bit`
+ """
+
+ def __init__(self,
+ b_sys='### System:\n',
+ e_sys='\n\n',
+ user='### User:\n',
+ eoh='\n\n',
+ assistant='### Assistant:\n',
+ eoa='\n\n',
+ system='',
+ session_len=2048,
+ **kwargs):
+ super().__init__(**kwargs)
+ self.b_sys = b_sys
+ self.e_sys = e_sys
+ self.user = user
+ self.eoh = eoh
+ self.assistant = assistant
+ self.eoa = eoa
+ self.system = system
+ self.session_len = session_len
+
+ def decorate_prompt(self, prompt, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template.
+
+ Args:
+ prompt (str): user's input prompt
+ sequence_start (bool): indicator for the first round chat of a
+ session sequence
+ Returns:
+ str: the concatenated prompt
+ """
+ assert self.capability == 'chat', \
+ f'{type(self).__name__} has no capability of {self.capability}'
+ if sequence_start:
+ return f'{self.b_sys}{self.system}{self.e_sys}' \
+ f'{self.user}{prompt}{self.eoh}{self.assistant}'
+
+ return f'{self.user}{prompt}{self.eoh}{self.assistant}'
+
+ def messages2prompt(self, messages, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template.
+
+ Args:
+ messages (str | List): user's input prompt
+ Returns:
+ str: the concatenated prompt
+ """
+ if isinstance(messages, str):
+ return self.get_prompt(messages, sequence_start)
+ system, users, assistants = self._translate_messages(messages)
+ system = self.system if not system else system
+ ret = f'{self.b_sys}{system}{self.e_sys}'
+ for i, (user, assistant) in enumerate(zip(users, assistants)):
+ ret += f'{self.user}{user}{self.eoh}{self.assistant}'
+ if assistant:
+ ret += f'{assistant}{self.eoa}'
+ return ret
+
+
+@MODELS.register_module(name='ultracm')
+@MODELS.register_module(name='ultralm')
+class UltraChat(BaseModel):
+ """Template of UltraCM and UltraLM models.
+
+ `https://huggingface.co/openbmb/UltraCM-13b`
+ `https://huggingface.co/openbmb/UltraLM-13b`
+ """
+
+ def __init__(
+ self,
+ system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501
+ eos='',
+ user='User: ',
+ assistant='Assistant: ',
+ session_len=2048,
+ **kwargs):
+ super().__init__(**kwargs)
+ self.system = system
+ self.eos = eos
+ self.session_len = session_len
+ self.user = user
+ self.assistant = assistant
+
+ def decorate_prompt(self, prompt, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template.
+
+ Args:
+ prompt (str): the input prompt
+ sequence_start (bool): indicator for the first round chat of a
+ session sequence
+ Returns:
+ str: the concatenated prompt
+ """
+ assert self.capability == 'chat', \
+ f'{type(self).__name__} has no capability of {self.capability}'
+ if sequence_start:
+ return f'{self.system}\n{self.user}{prompt}{self.eos}' \
+ f'\n{self.assistant}'
+
+ return f'\n{self.user}{prompt}{self.eos}' \
+ f'\n{self.assistant}'
+
+ def messages2prompt(self, messages, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template. Only evaluate the last instruction completion pair.
+
+ Args:
+ messages (str | List): user's input prompt
+ Returns:
+ str: the concatenated prompt
+ """
+ if isinstance(messages, str):
+ return self.get_prompt(messages, sequence_start)
+ system, users, assistants = self._translate_messages(messages)
+ system = self.system if not system else system
+ ret = f'{system}'
+ for user, assistant in zip(users, assistants):
+ if assistant:
+ ret += f'\n{self.user}{user}{self.eos}' \
+ f'\n{self.assistant}{assistant}{self.eos}'
+ else:
+ ret += f'\n{self.user}{user}{self.eos}' \
+ f'\n{self.assistant}'
+ return ret
+
+
def main(model_name: str = 'test'):
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
@@ -637,4 +778,5 @@ def main(model_name: str = 'test'):
if __name__ == '__main__':
import fire
+
fire.Fire(main)
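A sketch of exercising the newly registered SOLAR template. The messages are invented and the exact rendered string depends on `_translate_messages`, so the snippet simply prints whatever the template produces.

```python
# Sketch: render a prompt with the new 'solar' chat template registered above.
from lmdeploy.model import MODELS

solar = MODELS.get('solar')()
messages = [
    dict(role='user', content='Who are you?'),
    dict(role='assistant', content='I am SOLAR, an AI assistant.'),
    dict(role='user', content='Write a haiku about deployment.'),
]
print(solar.messages2prompt(messages))  # turns are framed by '### User:' / '### Assistant:'
```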
diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py
index c30cf6ffe9..2690480a8c 100644
--- a/lmdeploy/pytorch/chat.py
+++ b/lmdeploy/pytorch/chat.py
@@ -51,7 +51,6 @@
import logging
from typing import Optional
-import fire
import torch
from transformers import GenerationConfig, PreTrainedModel
@@ -205,6 +204,8 @@ def main(
def cli():
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/pytorch/modules/linear.py b/lmdeploy/pytorch/modules/linear.py
index bfde0d3d42..218a36407e 100644
--- a/lmdeploy/pytorch/modules/linear.py
+++ b/lmdeploy/pytorch/modules/linear.py
@@ -4,6 +4,11 @@
import torch
from torch import nn
+try:
+ import awq_inference_engine
+except ModuleNotFoundError:
+ awq_inference_engine = None
+
class WeightOnlyQLinear(nn.Module):
"""This class implements weight only quantization linear.
@@ -18,13 +23,15 @@ class WeightOnlyQLinear(nn.Module):
bias (Tensor, optional): Defaults to None.
"""
- def __init__(self,
- w_bit: int,
- symmetry: bool,
- group_size: int,
- in_features: int,
- out_features: int,
- bias: Optional[torch.Tensor] = None) -> None:
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ bias: Optional[torch.Tensor] = True,
+ w_bit: int = 4,
+ symmetry: bool = False,
+ group_size: int = 128,
+ ) -> None:
super().__init__()
if w_bit not in [2, 4, 8]:
@@ -92,8 +99,8 @@ def from_linear(cls: Type['WeightOnlyQLinear'],
out_features = linear.out_features
bias = False if linear.bias is None else True
- qlinear = cls(w_bit, symmetry, group_size, in_features, out_features,
- bias)
+ qlinear = cls(in_features, out_features, bias, w_bit, symmetry,
+ group_size)
qlinear.bias = linear.bias
qparams = quantizer.calculate_qparams(linear.weight)
@@ -124,3 +131,24 @@ def from_linear(cls: Type['WeightOnlyQLinear'],
qlinear.to('cpu')
return qlinear
+
+ @torch.no_grad()
+ def forward(self, x):
+ if awq_inference_engine is None:
+ raise RuntimeError(
+ 'Run the following command to install '
+ 'the kernel for 4bit inference\n\n'
+ 'git clone https://github.com/mit-han-lab/llm-awq.git\n'
+ 'cd awq/kernels\n'
+ 'python setup.py install\n')
+ out_shape = x.shape[:-1] + (self.out_features, )
+ inputs = x.reshape(-1, x.shape[-1])
+
+ out = awq_inference_engine.gemm_forward_cuda(inputs.half(),
+ self.qweight,
+ self.scales.half(),
+ self.qzeros,
+ self.group_size)
+ out = out + self.bias if self.bias is not None else out
+
+ return out.reshape(out_shape)
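The constructor reorder above makes `WeightOnlyQLinear` look like `nn.Linear` with trailing quantization settings. A small sketch of direct construction; the sizes are arbitrary, and running the forward pass still requires the optional `awq_inference_engine` kernels, as the new guard above explains.

```python
# Sketch: build a 4-bit weight-only linear layer with the reordered signature.
from lmdeploy.pytorch.modules.linear import WeightOnlyQLinear

qlinear = WeightOnlyQLinear(in_features=4096,   # arbitrary sizes for illustration
                            out_features=4096,
                            bias=True,
                            w_bit=4,
                            symmetry=False,
                            group_size=128)
# qlinear(x) raises a RuntimeError with install instructions if awq_inference_engine is absent.
```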
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
index 9588b00da1..5abae0d97a 100644
--- a/lmdeploy/serve/async_engine.py
+++ b/lmdeploy/serve/async_engine.py
@@ -4,9 +4,7 @@
import os.path as osp
import random
from contextlib import contextmanager
-from typing import Literal, Optional
-
-from lmdeploy.model import MODELS, BaseModel
+from typing import List, Literal, Optional
@dataclasses.dataclass
@@ -28,7 +26,7 @@ class AsyncEngine:
tp (int): tensor parallel
"""
- def __init__(self, model_path, instance_num=32, tp=1) -> None:
+ def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None:
from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer
tokenizer_model_path = osp.join(model_path, 'triton_models',
@@ -36,18 +34,21 @@ def __init__(self, model_path, instance_num=32, tp=1) -> None:
tokenizer = Tokenizer(tokenizer_model_path)
self.tm_model = tm.TurboMind(model_path,
eos_id=tokenizer.eos_token_id,
- tp=tp)
+ tp=tp,
+ **kwargs)
self.tokenizer = tokenizer
self.generators = [
self.tm_model.create_instance() for i in range(instance_num)
]
self.instance_num = instance_num
- self.model: BaseModel = MODELS.get(self.tm_model.model_name)()
+ self.model = self.tm_model.model
self.available = [True] * instance_num
self.starts = [None] * instance_num
self.steps = {}
+ self.loop = asyncio.get_event_loop()
def stop_session(self, session_id: int):
+ """Stop a session by a session_id."""
instance_id = session_id % self.instance_num
input_ids = self.tokenizer.encode('')
for outputs in self.generators[instance_id].stream_infer(
@@ -60,8 +61,24 @@ def stop_session(self, session_id: int):
pass
self.available[instance_id] = True
+ def end_session(self, session_id: int):
+ """Clear a session by a session_id."""
+ instance_id = session_id % self.instance_num
+ input_ids = self.tokenizer.encode('')
+ for outputs in self.generators[instance_id].stream_infer(
+ session_id,
+ input_ids,
+ request_output_len=0,
+ sequence_start=False,
+ sequence_end=True,
+ stop=True):
+ pass
+ self.steps[str(session_id)] = 0
+ self.available[instance_id] = True
+
@contextmanager
def safe_run(self, instance_id: int, session_id: Optional[int] = None):
+ """A context manager to make sure server's safe running."""
self.available[instance_id] = False
try:
yield
@@ -82,22 +99,80 @@ async def get_generator(self, instance_id: int, stop: bool = False):
await asyncio.sleep(0.1)
return self.generators[instance_id]
+ def batch_infer(self,
+ prompts: List[str],
+ request_output_len=512,
+ top_k=40,
+ top_p=0.8,
+ temperature=0.8,
+ repetition_penalty=1.0,
+ ignore_eos=False,
+ do_preprocess=True,
+ **kwargs):
+ """Inference a batch of prompts.
+
+ Args:
+ prompts (List[str]): a batch of prompts
+ request_output_len (int): output token nums
+ top_k (int): The number of the highest probability vocabulary
+ tokens to keep for top-k-filtering
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or higher
+ are kept for generation.
+ temperature (float): to modulate the next token probability
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+            do_preprocess (bool): whether to pre-process the messages.
+ """
+ assert isinstance(prompts, List), 'prompts should be a list'
+ batch_size = len(prompts)
+ outputs = [''] * batch_size
+ generators = []
+ for i, prompt in enumerate(prompts):
+ generators.append(
+ self.generate(prompt,
+ i,
+ stream_response=True,
+ sequence_start=True,
+ sequence_end=True,
+ request_output_len=request_output_len,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ ignore_eos=ignore_eos,
+ repetition_penalty=repetition_penalty,
+ do_preprocess=do_preprocess,
+ **kwargs))
+
+ async def _inner_call(i, generator):
+ async for out in generator:
+ outputs[i] += out.response
+
+ async def gather():
+ await asyncio.gather(
+ *[_inner_call(i, generators[i]) for i in range(batch_size)])
+
+ self.loop.run_until_complete(gather())
+ return outputs
+
async def generate(
- self,
- messages,
- session_id,
- stream_response=True,
- sequence_start=True,
- sequence_end=False,
- step=0,
- request_output_len=512,
- stop=False,
- top_k=40,
- top_p=0.8,
- temperature=0.8,
- repetition_penalty=1.0,
- ignore_eos=False,
- ):
+ self,
+ messages,
+ session_id,
+ stream_response=True,
+ sequence_start=True,
+ sequence_end=True, # no interactive mode by default
+ step=0,
+ request_output_len=512,
+ stop=False,
+ top_k=40,
+ top_p=0.8,
+ temperature=0.8,
+ repetition_penalty=1.0,
+ ignore_eos=False,
+ do_preprocess=True,
+ **kwargs):
"""Generate responses.
Args:
@@ -109,15 +184,16 @@ async def generate(
sequence_end (bool): indicator for ending a sequence
step (int): the offset of the k/v cache
stop (bool): whether stop inference
- top_p (float): If set to float < 1, only the smallest set of most
- probable tokens with probabilities that add up to top_p or higher
- are kept for generation.
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or higher
+ are kept for generation.
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
+            do_preprocess (bool): whether to pre-process the messages.
"""
instance_id = session_id % self.instance_num
if str(session_id) not in self.steps:
@@ -125,14 +201,18 @@ async def generate(
if step != 0:
self.steps[str(session_id)] = step
seed = random.getrandbits(64)
- prompt = self.model.messages2prompt(messages, sequence_start)
+ prompt = messages
+ if do_preprocess:
+ prompt = self.model.messages2prompt(prompt, sequence_start)
input_ids = self.tokenizer.encode(prompt)
finish_reason = 'stop' if stop else None
if self.steps[str(session_id)] + len(
- input_ids) >= self.tm_model.session_len:
+ input_ids) + request_output_len >= self.tm_model.session_len:
finish_reason = 'length'
yield GenOut('', self.steps[str(session_id)], len(input_ids), 0,
finish_reason)
+ if sequence_end is True and sequence_start is False:
+ self.end_session(session_id)
else:
generator = await self.get_generator(instance_id, stop)
with self.safe_run(instance_id, session_id):
@@ -156,6 +236,11 @@ async def generate(
# decode res
response = self.tokenizer.decode(res.tolist(),
offset=response_size)
+                    # a trailing '�' means the response ends with an
+                    # incomplete utf-8 byte sequence; keep accumulating and
+                    # decode it together with the next chunk
+ if response.endswith('�'):
+ continue
# response, history token len,
# input token len, gen token len
yield GenOut(response, self.steps[str(session_id)],
@@ -166,93 +251,3 @@ async def generate(
self.steps[str(session_id)] += len(input_ids) + tokens
if sequence_end or stop:
self.steps[str(session_id)] = 0
-
- async def generate_openai(
- self,
- messages,
- instance_id,
- stream_response=True,
- renew_session=False,
- request_output_len=512,
- stop=False,
- top_k=40,
- top_p=0.8,
- temperature=0.8,
- repetition_penalty=1.0,
- ignore_eos=False,
- ):
- """Generate responses.
-
- Args:
- messages (str | List): chat history or prompt
- instance_id (int): actually request host ip
- stream_response (bool): whether return responses streamingly
- renew_session (bool): renew the session
- request_output_len (int): output token nums
- stop (bool): whether stop inference
- top_p (float): If set to float < 1, only the smallest set of most
- probable tokens with probabilities that add up to top_p or higher
- are kept for generation.
- top_k (int): The number of the highest probability vocabulary
- tokens to keep for top-k-filtering
- temperature (float): to modulate the next token probability
- repetition_penalty (float): The parameter for repetition penalty.
- 1.0 means no penalty
- ignore_eos (bool): indicator for ignoring eos
- """
- session_id = instance_id
- instance_id %= self.instance_num
- sequence_start = False
- generator = await self.get_generator(instance_id)
- if renew_session: # renew a session
- empty_input_ids = self.tokenizer.encode('')
- for outputs in generator.stream_infer(session_id=session_id,
- input_ids=[empty_input_ids],
- request_output_len=0,
- sequence_start=False,
- sequence_end=True,
- stop=True):
- pass
- self.steps[str(session_id)] = 0
- if str(session_id) not in self.steps:
- self.steps[str(session_id)] = 0
- if self.steps[str(session_id)] == 0:
- sequence_start = True
- seed = random.getrandbits(64)
- prompt = self.model.messages2prompt(messages, sequence_start)
- input_ids = self.tokenizer.encode(prompt)
- finish_reason = 'stop' if stop else None
- if self.steps[str(session_id)] + len(
- input_ids) >= self.tm_model.session_len:
- finish_reason = 'length'
- yield GenOut('', self.steps[str(session_id)], len(input_ids), 0,
- finish_reason)
- else:
- with self.safe_run(instance_id, session_id):
- response_size = 0
- async for outputs in generator.async_stream_infer(
- session_id=session_id,
- input_ids=[input_ids],
- stream_output=stream_response,
- request_output_len=request_output_len,
- sequence_start=(sequence_start),
- sequence_end=False,
- step=self.steps[str(session_id)],
- stop=stop,
- top_k=top_k,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- ignore_eos=ignore_eos,
- random_seed=seed if sequence_start else None):
- res, tokens = outputs[0]
- # decode res
- response = self.tokenizer.decode(res.tolist(),
- offset=response_size)
- # response, history len, input len, generation len
- yield GenOut(response, self.steps[str(session_id)],
- len(input_ids), tokens, finish_reason)
- response_size = tokens
-
- # update step
- self.steps[str(session_id)] += len(input_ids) + tokens
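A sketch of the new `batch_infer` entry point on `AsyncEngine`; the workspace path is a placeholder for a model converted with `lmdeploy convert`.

```python
# Sketch: batched offline inference with the reworked AsyncEngine.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('./workspace', instance_num=4, tp=1)
prompts = ['Hi, please introduce yourself', 'Shanghai is']
outputs = engine.batch_infer(prompts, request_output_len=128, temperature=0.8)
for prompt, output in zip(prompts, outputs):
    print(f'>>> {prompt}\n{output}\n')
```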
diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py
index 283e96e299..424e83143f 100644
--- a/lmdeploy/serve/client.py
+++ b/lmdeploy/serve/client.py
@@ -1,8 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
-import fire
-
from lmdeploy.serve.turbomind.chatbot import Chatbot
@@ -20,7 +18,6 @@ def input_prompt(model_name):
def main(tritonserver_addr: str,
session_id: int = 1,
cap: str = 'chat',
- sys_instruct: str = None,
stream_output: bool = True,
**kwargs):
"""An example to communicate with inference server through the command line
@@ -32,13 +29,11 @@ def main(tritonserver_addr: str,
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infill', 'instruct', 'python']
- sys_instruct (str): the content of 'system' role, which is used by
- conversational model
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's chat template
"""
log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
- kwargs.update(capability=cap, system=sys_instruct)
+ kwargs.update(capability=cap)
chatbot = Chatbot(tritonserver_addr,
log_level=log_level,
display=stream_output,
@@ -69,4 +64,6 @@ def main(tritonserver_addr: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/gradio/__init__.py b/lmdeploy/serve/gradio/__init__.py
index ef101fec61..770138a44d 100644
--- a/lmdeploy/serve/gradio/__init__.py
+++ b/lmdeploy/serve/gradio/__init__.py
@@ -1 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .api_server_backend import run_api_server
+from .triton_server_backend import run_triton_server
+from .turbomind_coupled import run_local
+
+__all__ = ['run_api_server', 'run_triton_server', 'run_local']
diff --git a/lmdeploy/serve/gradio/api_server_backend.py b/lmdeploy/serve/gradio/api_server_backend.py
new file mode 100644
index 0000000000..8dd92fa0fd
--- /dev/null
+++ b/lmdeploy/serve/gradio/api_server_backend.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import time
+from threading import Lock
+from typing import Sequence
+
+import gradio as gr
+
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+from lmdeploy.serve.openai.api_client import (get_model_list,
+ get_streaming_response)
+
+
+class InterFace:
+ api_server_url: str = None
+ global_session_id: int = 0
+ lock = Lock()
+
+
+def chat_stream_restful(instruction: str, state_chatbot: Sequence,
+ cancel_btn: gr.Button, reset_btn: gr.Button,
+ session_id: int):
+ """Chat with AI assistant.
+
+ Args:
+ instruction (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ state_chatbot = state_chatbot + [(instruction, None)]
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+ for response, tokens, finish_reason in get_streaming_response(
+ instruction,
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=512,
+ interactive_mode=True):
+ if finish_reason == 'length':
+ gr.Warning('WARNING: exceed session max length.'
+ ' Please restart the session by reset button.')
+ if tokens < 0:
+ gr.Warning('WARNING: running on the old session.'
+ ' Please restart the session by reset button.')
+ if state_chatbot[-1][-1] is None:
+ state_chatbot[-1] = (state_chatbot[-1][0], response)
+ else:
+ state_chatbot[-1] = (state_chatbot[-1][0],
+ state_chatbot[-1][1] + response
+ ) # piece by piece
+ yield (state_chatbot, state_chatbot, enable_btn, disable_btn)
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
+ session_id: int):
+ """reset the session.
+
+ Args:
+ instruction_txtbox (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ state_chatbot = []
+ # end the session
+ for response, tokens, finish_reason in get_streaming_response(
+ '',
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=0,
+ interactive_mode=False):
+ pass
+
+ return (
+ state_chatbot,
+ state_chatbot,
+ gr.Textbox.update(value=''),
+ )
+
+
+def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button,
+ reset_btn: gr.Button, session_id: int):
+ """stop the session.
+
+ Args:
+ instruction_txtbox (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ yield (state_chatbot, disable_btn, disable_btn)
+ # end the session
+ for out in get_streaming_response(
+ '',
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=0,
+ stop=True):
+ pass
+ time.sleep(0.5)
+ messages = []
+ for qa in state_chatbot:
+ messages.append(dict(role='user', content=qa[0]))
+ if qa[1] is not None:
+ messages.append(dict(role='assistant', content=qa[1]))
+ for out in get_streaming_response(
+ messages,
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=0,
+ interactive_mode=True):
+ pass
+ yield (state_chatbot, disable_btn, enable_btn)
+
+
+def run_api_server(api_server_url: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006,
+ batch_size: int = 32):
+ """chat with AI assistant through web ui.
+
+ Args:
+        api_server_url (str): restful api url
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+        batch_size (int): concurrency of the gradio request queue
+ """
+ InterFace.api_server_url = api_server_url
+ model_names = get_model_list(f'{api_server_url}/v1/models')
+ model_name = ''
+ if isinstance(model_names, list) and len(model_names) > 0:
+ model_name = model_names[0]
+ else:
+        raise ValueError('gradio can not find a suitable model from restful-api')
+
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ state_chatbot = gr.State([])
+ state_session_id = gr.State(0)
+
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy Playground')
+
+ chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
+ instruction_txtbox = gr.Textbox(
+ placeholder='Please input the instruction',
+ label='Instruction')
+ with gr.Row():
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ send_event = instruction_txtbox.submit(chat_stream_restful, [
+ instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
+ state_session_id
+ ], [state_chatbot, chatbot, cancel_btn, reset_btn])
+ instruction_txtbox.submit(
+ lambda: gr.Textbox.update(value=''),
+ [],
+ [instruction_txtbox],
+ )
+ cancel_btn.click(
+ cancel_restful_func,
+ [state_chatbot, cancel_btn, reset_btn, state_session_id],
+ [state_chatbot, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(reset_restful_func,
+ [instruction_txtbox, state_chatbot, state_session_id],
+ [state_chatbot, chatbot, instruction_txtbox],
+ cancels=[send_event])
+
+ def init():
+ with InterFace.lock:
+ InterFace.global_session_id += 1
+ new_session_id = InterFace.global_session_id
+ return new_session_id
+
+ demo.load(init, inputs=None, outputs=[state_session_id])
+
+    print(f'server will be mounted at: http://{server_name}:{server_port}')
+ demo.queue(concurrency_count=batch_size, max_size=100,
+ api_open=True).launch(
+ max_threads=10,
+ share=True,
+ server_port=server_port,
+ server_name=server_name,
+ )
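
For reference, a minimal sketch of launching the new api_server-backed gradio UI programmatically, assuming an api_server is already listening at an address such as http://0.0.0.0:23333 (the URL and ports below are illustrative):

```python
# Sketch: mount a gradio playground on top of a running api_server.
from lmdeploy.serve.gradio.api_server_backend import run_api_server

if __name__ == '__main__':
    run_api_server('http://0.0.0.0:23333',  # assumed api_server address
                   server_name='0.0.0.0',
                   server_port=6006,
                   batch_size=32)
```
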
diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py
index 71db7a2749..5b1668224d 100644
--- a/lmdeploy/serve/gradio/app.py
+++ b/lmdeploy/serve/gradio/app.py
@@ -1,542 +1,41 @@
# Copyright (c) OpenMMLab. All rights reserved.
-import os
-import threading
-import time
-from functools import partial
-from typing import Sequence
-
-import fire
-import gradio as gr
-
-from lmdeploy.serve.async_engine import AsyncEngine
-from lmdeploy.serve.gradio.css import CSS
-from lmdeploy.serve.openai.api_client import (get_model_list,
- get_streaming_response)
-from lmdeploy.serve.openai.api_server import ip2id
-from lmdeploy.serve.turbomind.chatbot import Chatbot
-
-THEME = gr.themes.Soft(
- primary_hue=gr.themes.colors.blue,
- secondary_hue=gr.themes.colors.sky,
- font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
-
-enable_btn = gr.Button.update(interactive=True)
-disable_btn = gr.Button.update(interactive=False)
-
-
-def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot,
- request: gr.Request):
- """Chat with AI assistant.
-
- Args:
- instruction (str): user's prompt
- state_chatbot (Sequence): the chatting history
- llama_chatbot (Chatbot): the instance of a chatbot
- request (gr.Request): the request from a user
- model_name (str): the name of deployed model
- """
- instruction = state_chatbot[-1][0]
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
-
- bot_response = llama_chatbot.stream_infer(
- session_id, instruction, f'{session_id}-{len(state_chatbot)}')
-
- for status, tokens, _ in bot_response:
- state_chatbot[-1] = (state_chatbot[-1][0], tokens)
- yield (state_chatbot, state_chatbot, '')
-
- return (state_chatbot, state_chatbot, '')
-
-
-def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
- llama_chatbot: gr.State, triton_server_addr: str,
- model_name: str):
- """reset the session."""
- state_chatbot = []
- log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
- llama_chatbot = Chatbot(triton_server_addr,
- model_name,
- log_level=log_level,
- display=True)
-
- return (
- llama_chatbot,
- state_chatbot,
- state_chatbot,
- gr.Textbox.update(value=''),
- )
-
-
-def cancel_func(
- instruction_txtbox: gr.Textbox,
- state_chatbot: gr.State,
- llama_chatbot: gr.State,
-):
- """cancel the session."""
- session_id = llama_chatbot._session.session_id
- llama_chatbot.cancel(session_id)
-
- return (
- llama_chatbot,
- state_chatbot,
- )
-
-
-def add_instruction(instruction, state_chatbot):
- state_chatbot = state_chatbot + [(instruction, None)]
- return ('', state_chatbot)
-
-
-def run_server(triton_server_addr: str,
- server_name: str = 'localhost',
- server_port: int = 6006):
- """chat with AI assistant through web ui.
-
- Args:
- triton_server_addr (str): the communication address of inference server
- server_name (str): the ip address of gradio server
- server_port (int): the port of gradio server
- """
- with gr.Blocks(css=CSS, theme=THEME) as demo:
- log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
- llama_chatbot = gr.State(
- Chatbot(triton_server_addr, log_level=log_level, display=True))
- state_chatbot = gr.State([])
- model_name = llama_chatbot.value.model_name
- reset_all = partial(reset_all_func,
- model_name=model_name,
- triton_server_addr=triton_server_addr)
-
- with gr.Column(elem_id='container'):
- gr.Markdown('## LMDeploy Playground')
-
- chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
- instruction_txtbox = gr.Textbox(
- placeholder='Please input the instruction',
- label='Instruction')
- with gr.Row():
- cancel_btn = gr.Button(value='Cancel')
- reset_btn = gr.Button(value='Reset')
-
- send_event = instruction_txtbox.submit(
- add_instruction, [instruction_txtbox, state_chatbot],
- [instruction_txtbox, state_chatbot]).then(
- chat_stream, [state_chatbot, llama_chatbot],
- [state_chatbot, chatbot])
-
- cancel_btn.click(cancel_func,
- [instruction_txtbox, state_chatbot, llama_chatbot],
- [llama_chatbot, chatbot],
- cancels=[send_event])
-
- reset_btn.click(
- reset_all, [instruction_txtbox, state_chatbot, llama_chatbot],
- [llama_chatbot, state_chatbot, chatbot, instruction_txtbox],
- cancels=[send_event])
-
- print(f'server is gonna mount on: http://{server_name}:{server_port}')
- demo.queue(concurrency_count=4, max_size=100, api_open=True).launch(
- max_threads=10,
- share=True,
- server_port=server_port,
- server_name=server_name,
- )
-
-
-# a IO interface mananing variables
-class InterFace:
- async_engine: AsyncEngine = None # for run_local
- restful_api_url: str = None # for run_restful
-
-
-def chat_stream_restful(
- instruction: str,
- state_chatbot: Sequence,
- cancel_btn: gr.Button,
- reset_btn: gr.Button,
- request: gr.Request,
-):
- """Chat with AI assistant.
-
- Args:
- instruction (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- bot_summarized_response = ''
- state_chatbot = state_chatbot + [(instruction, None)]
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
- for response, tokens, finish_reason in get_streaming_response(
- instruction,
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=512,
- sequence_start=(len(state_chatbot) == 1),
- sequence_end=False):
- if finish_reason == 'length':
- gr.Warning('WARNING: exceed session max length.'
- ' Please restart the session by reset button.')
- if tokens < 0:
- gr.Warning('WARNING: running on the old session.'
- ' Please restart the session by reset button.')
- if state_chatbot[-1][-1] is None:
- state_chatbot[-1] = (state_chatbot[-1][0], response)
- else:
- state_chatbot[-1] = (state_chatbot[-1][0],
- state_chatbot[-1][1] + response
- ) # piece by piece
- yield (state_chatbot, state_chatbot, enable_btn, disable_btn,
- f'{bot_summarized_response}'.strip())
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
-
-def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
- request: gr.Request):
- """reset the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- state_chatbot = []
-
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- for response, tokens, finish_reason in get_streaming_response(
- '',
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=False,
- sequence_end=True):
- pass
-
- return (
- state_chatbot,
- state_chatbot,
- gr.Textbox.update(value=''),
- )
-
-
-def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button,
- reset_btn: gr.Button, request: gr.Request):
- """stop the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- for out in get_streaming_response('',
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=False,
- sequence_end=False,
- stop=True):
- pass
- time.sleep(0.5)
- messages = []
- for qa in state_chatbot:
- messages.append(dict(role='user', content=qa[0]))
- if qa[1] is not None:
- messages.append(dict(role='assistant', content=qa[1]))
- for out in get_streaming_response(messages,
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=True,
- sequence_end=False):
- pass
- return (state_chatbot, disable_btn, enable_btn)
-
-
-def run_restful(restful_api_url: str,
- server_name: str = 'localhost',
- server_port: int = 6006,
- batch_size: int = 32):
- """chat with AI assistant through web ui.
-
- Args:
- restful_api_url (str): restufl api url
- server_name (str): the ip address of gradio server
- server_port (int): the port of gradio server
- batch_size (int): batch size for running Turbomind directly
- """
- InterFace.restful_api_url = restful_api_url
- model_names = get_model_list(f'{restful_api_url}/v1/models')
- model_name = ''
- if isinstance(model_names, list) and len(model_names) > 0:
- model_name = model_names[0]
- else:
- raise ValueError('gradio can find a suitable model from restful-api')
-
- with gr.Blocks(css=CSS, theme=THEME) as demo:
- state_chatbot = gr.State([])
-
- with gr.Column(elem_id='container'):
- gr.Markdown('## LMDeploy Playground')
-
- chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
- instruction_txtbox = gr.Textbox(
- placeholder='Please input the instruction',
- label='Instruction')
- with gr.Row():
- cancel_btn = gr.Button(value='Cancel', interactive=False)
- reset_btn = gr.Button(value='Reset')
-
- send_event = instruction_txtbox.submit(
- chat_stream_restful,
- [instruction_txtbox, state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, chatbot, cancel_btn, reset_btn])
- instruction_txtbox.submit(
- lambda: gr.Textbox.update(value=''),
- [],
- [instruction_txtbox],
- )
- cancel_btn.click(cancel_restful_func,
- [state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, cancel_btn, reset_btn],
- cancels=[send_event])
-
- reset_btn.click(reset_restful_func,
- [instruction_txtbox, state_chatbot],
- [state_chatbot, chatbot, instruction_txtbox],
- cancels=[send_event])
-
- print(f'server is gonna mount on: http://{server_name}:{server_port}')
- demo.queue(concurrency_count=batch_size, max_size=100,
- api_open=True).launch(
- max_threads=10,
- share=True,
- server_port=server_port,
- server_name=server_name,
- )
-
-
-async def chat_stream_local(
- instruction: str,
- state_chatbot: Sequence,
- cancel_btn: gr.Button,
- reset_btn: gr.Button,
- request: gr.Request,
-):
- """Chat with AI assistant.
-
- Args:
- instruction (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- bot_summarized_response = ''
- state_chatbot = state_chatbot + [(instruction, None)]
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
- async for outputs in InterFace.async_engine.generate(
- instruction,
- session_id,
- stream_response=True,
- sequence_start=(len(state_chatbot) == 1)):
- response = outputs.response
- if outputs.finish_reason == 'length':
- gr.Warning('WARNING: exceed session max length.'
- ' Please restart the session by reset button.')
- if outputs.generate_token_len < 0:
- gr.Warning('WARNING: running on the old session.'
- ' Please restart the session by reset button.')
- if state_chatbot[-1][-1] is None:
- state_chatbot[-1] = (state_chatbot[-1][0], response)
- else:
- state_chatbot[-1] = (state_chatbot[-1][0],
- state_chatbot[-1][1] + response
- ) # piece by piece
- yield (state_chatbot, state_chatbot, enable_btn, disable_btn,
- f'{bot_summarized_response}'.strip())
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
-
-async def reset_local_func(instruction_txtbox: gr.Textbox,
- state_chatbot: gr.State, request: gr.Request):
- """reset the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- state_chatbot = []
-
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- async for out in InterFace.async_engine.generate('',
- session_id,
- request_output_len=1,
- stream_response=True,
- sequence_start=False,
- sequence_end=True):
- pass
-
- return (
- state_chatbot,
- state_chatbot,
- gr.Textbox.update(value=''),
- )
-
-
-async def cancel_local_func(state_chatbot: gr.State, cancel_btn: gr.Button,
- reset_btn: gr.Button, request: gr.Request):
- """stop the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- async for out in InterFace.async_engine.generate('',
- session_id,
- request_output_len=0,
- stream_response=True,
- sequence_start=False,
- sequence_end=False,
- stop=True):
- pass
- messages = []
- for qa in state_chatbot:
- messages.append(dict(role='user', content=qa[0]))
- if qa[1] is not None:
- messages.append(dict(role='assistant', content=qa[1]))
- async for out in InterFace.async_engine.generate(messages,
- session_id,
- request_output_len=0,
- stream_response=True,
- sequence_start=True,
- sequence_end=False):
- pass
- return (state_chatbot, disable_btn, enable_btn)
-
-
-def run_local(model_path: str,
- server_name: str = 'localhost',
- server_port: int = 6006,
- batch_size: int = 4,
- tp: int = 1):
- """chat with AI assistant through web ui.
-
- Args:
- model_path (str): the path of the deployed model
- server_name (str): the ip address of gradio server
- server_port (int): the port of gradio server
- batch_size (int): batch size for running Turbomind directly
- tp (int): tensor parallel for Turbomind
- """
- InterFace.async_engine = AsyncEngine(model_path=model_path,
- instance_num=batch_size,
- tp=tp)
-
- with gr.Blocks(css=CSS, theme=THEME) as demo:
- state_chatbot = gr.State([])
-
- with gr.Column(elem_id='container'):
- gr.Markdown('## LMDeploy Playground')
-
- chatbot = gr.Chatbot(
- elem_id='chatbot',
- label=InterFace.async_engine.tm_model.model_name)
- instruction_txtbox = gr.Textbox(
- placeholder='Please input the instruction',
- label='Instruction')
- with gr.Row():
- cancel_btn = gr.Button(value='Cancel', interactive=False)
- reset_btn = gr.Button(value='Reset')
-
- send_event = instruction_txtbox.submit(
- chat_stream_local,
- [instruction_txtbox, state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, chatbot, cancel_btn, reset_btn])
- instruction_txtbox.submit(
- lambda: gr.Textbox.update(value=''),
- [],
- [instruction_txtbox],
- )
- cancel_btn.click(cancel_local_func,
- [state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, cancel_btn, reset_btn],
- cancels=[send_event])
-
- reset_btn.click(reset_local_func, [instruction_txtbox, state_chatbot],
- [state_chatbot, chatbot, instruction_txtbox],
- cancels=[send_event])
-
- print(f'server is gonna mount on: http://{server_name}:{server_port}')
- demo.queue(concurrency_count=batch_size, max_size=100,
- api_open=True).launch(
- max_threads=10,
- share=True,
- server_port=server_port,
- server_name=server_name,
- )
def run(model_path_or_server: str,
- server_name: str = 'localhost',
+ server_name: str = '0.0.0.0',
server_port: int = 6006,
batch_size: int = 32,
tp: int = 1,
- restful_api: bool = False):
+ **kwargs):
"""chat with AI assistant through web ui.
Args:
model_path_or_server (str): the path of the deployed model or the
- tritonserver URL or restful api URL. The former is for directly
- running service with gradio. The latter is for running with
- tritonserver by default. If the input URL is restful api. Please
- enable another flag `restful_api`.
+ tritonserver URL or restful api URL. For example:
+ - ./workspace
+ - 0.0.0.0:23333
+ - http://0.0.0.0:23333
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
- restufl_api (bool): a flag for model_path_or_server
"""
if ':' in model_path_or_server:
- if restful_api:
- run_restful(model_path_or_server, server_name, server_port,
- batch_size)
+ if 'http:' in model_path_or_server:
+ from lmdeploy.serve.gradio.api_server_backend import run_api_server
+ run_api_server(model_path_or_server, server_name, server_port,
+ batch_size)
else:
- run_server(model_path_or_server, server_name, server_port)
+ from lmdeploy.serve.gradio.triton_server_backend import \
+ run_triton_server
+ run_triton_server(model_path_or_server, server_name, server_port)
else:
+ from lmdeploy.serve.gradio.turbomind_coupled import run_local
run_local(model_path_or_server, server_name, server_port, batch_size,
tp)
if __name__ == '__main__':
+ import fire
+
fire.Fire(run)
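
The slimmed-down `run` now only decides which backend to import based on the form of `model_path_or_server`; a sketch of the three resulting paths (addresses are illustrative, and each call blocks while the gradio server runs):

```python
# Sketch of the dispatch done by lmdeploy.serve.gradio.app.run():
#   './workspace'          -> turbomind_coupled.run_local   (in-process engine)
#   '0.0.0.0:33337'        -> triton_server_backend.run_triton_server
#   'http://0.0.0.0:23333' -> api_server_backend.run_api_server
from lmdeploy.serve.gradio.app import run

run('http://0.0.0.0:23333', server_name='0.0.0.0', server_port=6006)
```
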
diff --git a/lmdeploy/serve/gradio/constants.py b/lmdeploy/serve/gradio/constants.py
new file mode 100644
index 0000000000..891c572e5a
--- /dev/null
+++ b/lmdeploy/serve/gradio/constants.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import gradio as gr
+
+CSS = """
+#container {
+ width: 95%;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+#chatbot {
+ height: 500px;
+ overflow: auto;
+}
+
+.chat_wrap_space {
+ margin-left: 0.5em
+}
+"""
+
+THEME = gr.themes.Soft(
+ primary_hue=gr.themes.colors.blue,
+ secondary_hue=gr.themes.colors.sky,
+ font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
+
+enable_btn = gr.Button.update(interactive=True)
+disable_btn = gr.Button.update(interactive=False)
diff --git a/lmdeploy/serve/gradio/css.py b/lmdeploy/serve/gradio/css.py
deleted file mode 100644
index b3bd233222..0000000000
--- a/lmdeploy/serve/gradio/css.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-CSS = """
-#container {
- width: 95%;
- margin-left: auto;
- margin-right: auto;
-}
-
-#chatbot {
- height: 500px;
- overflow: auto;
-}
-
-.chat_wrap_space {
- margin-left: 0.5em
-}
-"""
diff --git a/lmdeploy/serve/gradio/triton_server_backend.py b/lmdeploy/serve/gradio/triton_server_backend.py
new file mode 100644
index 0000000000..9148903cc5
--- /dev/null
+++ b/lmdeploy/serve/gradio/triton_server_backend.py
@@ -0,0 +1,143 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from functools import partial
+from threading import Lock
+from typing import Sequence
+
+import gradio as gr
+
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+from lmdeploy.serve.turbomind.chatbot import Chatbot
+
+
+class InterFace:
+ global_session_id: int = 0
+ lock = Lock()
+
+
+def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot,
+ cancel_btn: gr.Button, reset_btn: gr.Button, session_id: int):
+ """Chat with AI assistant.
+
+ Args:
+        state_chatbot (Sequence): the chatting history
+        llama_chatbot (Chatbot): the instance of a chatbot
+        cancel_btn (gr.Button): the cancel button
+        reset_btn (gr.Button): the reset button
+ session_id (int): the session id
+ """
+ instruction = state_chatbot[-1][0]
+
+ bot_response = llama_chatbot.stream_infer(
+ session_id, instruction, f'{session_id}-{len(state_chatbot)}')
+
+ for status, tokens, _ in bot_response:
+ state_chatbot[-1] = (state_chatbot[-1][0], tokens)
+ yield (state_chatbot, state_chatbot, enable_btn, disable_btn)
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
+ llama_chatbot: gr.State, triton_server_addr: str,
+ model_name: str):
+ """reset the session."""
+ state_chatbot = []
+ log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
+ llama_chatbot = Chatbot(triton_server_addr,
+ model_name,
+ log_level=log_level,
+ display=True)
+
+ return (
+ llama_chatbot,
+ state_chatbot,
+ state_chatbot,
+ gr.Textbox.update(value=''),
+ )
+
+
+def cancel_func(
+ state_chatbot: gr.State,
+ llama_chatbot: gr.State,
+ cancel_btn: gr.Button,
+ reset_btn: gr.Button,
+):
+ """cancel the session."""
+ yield (llama_chatbot, state_chatbot, disable_btn, disable_btn)
+ session_id = llama_chatbot._session.session_id
+ llama_chatbot.cancel(session_id)
+
+ yield (llama_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+def add_instruction(instruction, state_chatbot):
+ state_chatbot = state_chatbot + [(instruction, None)]
+ return ('', state_chatbot)
+
+
+def run_triton_server(triton_server_addr: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006):
+ """chat with AI assistant through web ui.
+
+ Args:
+ triton_server_addr (str): the communication address of inference server
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ """
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
+ llama_chatbot = gr.State(
+ Chatbot(triton_server_addr, log_level=log_level, display=True))
+ state_chatbot = gr.State([])
+ state_session_id = gr.State(0)
+ model_name = llama_chatbot.value.model_name
+ reset_all = partial(reset_all_func,
+ model_name=model_name,
+ triton_server_addr=triton_server_addr)
+
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy Playground')
+
+ chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
+ instruction_txtbox = gr.Textbox(
+ placeholder='Please input the instruction',
+ label='Instruction')
+ with gr.Row():
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ send_event = instruction_txtbox.submit(
+ add_instruction, [instruction_txtbox, state_chatbot],
+ [instruction_txtbox, state_chatbot]).then(chat_stream, [
+ state_chatbot, llama_chatbot, cancel_btn, reset_btn,
+ state_session_id
+ ], [state_chatbot, chatbot, cancel_btn, reset_btn])
+
+ cancel_btn.click(cancel_func,
+ [state_chatbot, llama_chatbot, cancel_btn, reset_btn],
+ [llama_chatbot, chatbot, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(
+ reset_all, [instruction_txtbox, state_chatbot, llama_chatbot],
+ [llama_chatbot, state_chatbot, chatbot, instruction_txtbox],
+ cancels=[send_event])
+
+ def init():
+ with InterFace.lock:
+ InterFace.global_session_id += 1
+ new_session_id = InterFace.global_session_id
+ return new_session_id
+
+ demo.load(init, inputs=None, outputs=[state_session_id])
+
+    print(f'server will be mounted at: http://{server_name}:{server_port}')
+ demo.queue(concurrency_count=4, max_size=100, api_open=True).launch(
+ max_threads=10,
+ share=True,
+ server_port=server_port,
+ server_name=server_name,
+ )
diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py
new file mode 100644
index 0000000000..e344abcbda
--- /dev/null
+++ b/lmdeploy/serve/gradio/turbomind_coupled.py
@@ -0,0 +1,187 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from threading import Lock
+from typing import Sequence
+
+import gradio as gr
+
+from lmdeploy.serve.async_engine import AsyncEngine
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+
+
+class InterFace:
+ async_engine: AsyncEngine = None
+ global_session_id: int = 0
+ lock = Lock()
+
+
+async def chat_stream_local(
+ instruction: str,
+ state_chatbot: Sequence,
+ cancel_btn: gr.Button,
+ reset_btn: gr.Button,
+ session_id: int,
+):
+ """Chat with AI assistant.
+
+ Args:
+ instruction (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ cancel_btn (gr.Button): the cancel button
+ reset_btn (gr.Button): the reset button
+ session_id (int): the session id
+ """
+ state_chatbot = state_chatbot + [(instruction, None)]
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+ async for outputs in InterFace.async_engine.generate(
+ instruction,
+ session_id,
+ stream_response=True,
+ sequence_start=(len(state_chatbot) == 1),
+ sequence_end=False):
+ response = outputs.response
+ if outputs.finish_reason == 'length':
+            gr.Warning('WARNING: exceeded session max length.'
+                       ' Please restart the session by the reset button.')
+        if outputs.generate_token_len < 0:
+            gr.Warning('WARNING: running on an old session.'
+                       ' Please restart the session by the reset button.')
+ if state_chatbot[-1][-1] is None:
+ state_chatbot[-1] = (state_chatbot[-1][0], response)
+ else:
+ state_chatbot[-1] = (state_chatbot[-1][0],
+ state_chatbot[-1][1] + response
+ ) # piece by piece
+ yield (state_chatbot, state_chatbot, enable_btn, disable_btn)
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+async def reset_local_func(instruction_txtbox: gr.Textbox,
+ state_chatbot: Sequence, session_id: int):
+ """reset the session.
+
+ Args:
+ instruction_txtbox (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ state_chatbot = []
+ # end the session
+ async for out in InterFace.async_engine.generate('',
+ session_id,
+ request_output_len=1,
+ stream_response=True,
+ sequence_start=False,
+ sequence_end=True):
+ pass
+ return (state_chatbot, state_chatbot, gr.Textbox.update(value=''))
+
+
+async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,
+ reset_btn: gr.Button, session_id: int):
+ """stop the session.
+
+ Args:
+ state_chatbot (Sequence): the chatting history
+ cancel_btn (gr.Button): the cancel button
+ reset_btn (gr.Button): the reset button
+ session_id (int): the session id
+ """
+ yield (state_chatbot, disable_btn, enable_btn)
+ async for out in InterFace.async_engine.generate('',
+ session_id,
+ request_output_len=0,
+ stream_response=True,
+ sequence_start=False,
+ sequence_end=False,
+ stop=True):
+ pass
+ messages = []
+ for qa in state_chatbot:
+ messages.append(dict(role='user', content=qa[0]))
+ if qa[1] is not None:
+ messages.append(dict(role='assistant', content=qa[1]))
+ async for out in InterFace.async_engine.generate(messages,
+ session_id,
+ request_output_len=0,
+ stream_response=True,
+ sequence_start=True,
+ sequence_end=False):
+ pass
+ yield (state_chatbot, disable_btn, enable_btn)
+
+
+def run_local(model_path: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006,
+ batch_size: int = 4,
+ tp: int = 1):
+ """chat with AI assistant through web ui.
+
+ Args:
+ model_path (str): the path of the deployed model
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ batch_size (int): batch size for running Turbomind directly
+ tp (int): tensor parallel for Turbomind
+ """
+ InterFace.async_engine = AsyncEngine(model_path=model_path,
+ instance_num=batch_size,
+ tp=tp)
+
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ state_chatbot = gr.State([])
+ state_session_id = gr.State(0)
+
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy Playground')
+
+ chatbot = gr.Chatbot(
+ elem_id='chatbot',
+ label=InterFace.async_engine.tm_model.model_name)
+ instruction_txtbox = gr.Textbox(
+ placeholder='Please input the instruction',
+ label='Instruction')
+ with gr.Row():
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ send_event = instruction_txtbox.submit(chat_stream_local, [
+ instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
+ state_session_id
+ ], [state_chatbot, chatbot, cancel_btn, reset_btn])
+ instruction_txtbox.submit(
+ lambda: gr.Textbox.update(value=''),
+ [],
+ [instruction_txtbox],
+ )
+ cancel_btn.click(
+ cancel_local_func,
+ [state_chatbot, cancel_btn, reset_btn, state_session_id],
+ [state_chatbot, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(reset_local_func,
+ [instruction_txtbox, state_chatbot, state_session_id],
+ [state_chatbot, chatbot, instruction_txtbox],
+ cancels=[send_event])
+
+ def init():
+ with InterFace.lock:
+ InterFace.global_session_id += 1
+ new_session_id = InterFace.global_session_id
+ return new_session_id
+
+ demo.load(init, inputs=None, outputs=[state_session_id])
+
+    print(f'server will be mounted at: http://{server_name}:{server_port}')
+ demo.queue(concurrency_count=batch_size, max_size=100,
+ api_open=True).launch(
+ max_threads=10,
+ share=True,
+ server_port=server_port,
+ server_name=server_name,
+ )
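
Both new gradio backends hand out session ids with the same pattern: a process-wide counter guarded by a lock, bumped once per page load through `demo.load`. A standalone sketch of that pattern (the widget names and `show` handler are made up for illustration; gradio 3.x API):

```python
from threading import Lock

import gradio as gr


class SessionCounter:
    """Process-wide counter (illustrative stand-in for InterFace)."""
    value: int = 0
    lock = Lock()


def init():
    # one fresh id per page load; every event from that tab reuses it
    with SessionCounter.lock:
        SessionCounter.value += 1
        return SessionCounter.value


with gr.Blocks() as demo:
    session_id = gr.State(0)
    shown = gr.Textbox(label='my session id')
    show = gr.Button('show')
    show.click(lambda sid: str(sid), [session_id], [shown])
    demo.load(init, inputs=None, outputs=[session_id])

if __name__ == '__main__':
    demo.queue().launch()
```
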
diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index a8718331be..a1610e05ea 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -1,8 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
-from typing import Iterable, List
+from typing import Any, Dict, Iterable, List, Optional, Union
-import fire
import requests
@@ -15,13 +14,306 @@ def get_model_list(api_url: str):
return None
+class APIClient:
+    """Client of LMDeploy's RESTful api_server.
+
+    Args:
+        api_server_url (str): the address of the api_server, e.g.
+            'http://{server_ip}:{server_port}'
+ """
+
+ def __init__(self, api_server_url: str, **kwargs):
+ self.api_server_url = api_server_url
+        self.chat_interactive_v1_url = f'{api_server_url}/v1/chat/interactive'
+ self.chat_completions_v1_url = f'{api_server_url}/v1/chat/completions'
+ self.completions_v1_url = f'{api_server_url}/v1/completions'
+ self.models_v1_url = f'{api_server_url}/v1/models'
+ self._available_models = None
+
+ @property
+ def available_models(self):
+ """Show available models."""
+ if self._available_models is not None:
+ return self._available_models
+ response = requests.get(self.models_v1_url)
+ if hasattr(response, 'text'):
+ model_list = json.loads(response.text)
+ model_list = model_list.pop('data', [])
+ self._available_models = [item['id'] for item in model_list]
+ return self._available_models
+ return None
+
+ def chat_completions_v1(self,
+ model: str,
+ messages: Union[str, List[Dict[str, str]]],
+ temperature: Optional[float] = 0.7,
+ top_p: Optional[float] = 1.0,
+ n: Optional[int] = 1,
+ max_tokens: Optional[int] = 512,
+ stop: Optional[bool] = False,
+ stream: Optional[bool] = False,
+ presence_penalty: Optional[float] = 0.0,
+ frequency_penalty: Optional[float] = 0.0,
+ user: Optional[str] = None,
+ repetition_penalty: Optional[float] = 1.0,
+ session_id: Optional[int] = -1,
+ ignore_eos: Optional[bool] = False,
+ **kwargs):
+ """Chat completion v1.
+
+ Args:
+ model: model name. Available from self.available_models.
+ messages: string prompt or chat history in OpenAI format.
+ temperature (float): to modulate the next token probability
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ n (int): How many chat completion choices to generate for each
+ input message. Only support one here.
+ stream: whether to stream the results or not. Default to false.
+ max_tokens (int): output token nums
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+ session_id (int): if not specified, will set random value
+
+ Yields:
+ json objects in openai formats
+ """
+ pload = {
+ k: v
+ for k, v in locals().copy().items()
+ if k[:2] != '__' and k not in ['self']
+ }
+ headers = {'content-type': 'application/json'}
+ response = requests.post(self.chat_completions_v1_url,
+ headers=headers,
+ json=pload,
+ stream=stream)
+ for chunk in response.iter_lines(chunk_size=8192,
+ decode_unicode=False,
+ delimiter=b'\n'):
+ if chunk:
+ if stream:
+ decoded = chunk.decode('utf-8')
+ if decoded == 'data: [DONE]':
+ continue
+ if decoded[:6] == 'data: ':
+ decoded = decoded[6:]
+ output = json.loads(decoded)
+ yield output
+ else:
+ decoded = chunk.decode('utf-8')
+ output = json.loads(decoded)
+ yield output
+
+ def chat_interactive_v1(self,
+ prompt: Union[str, List[Dict[str, str]]],
+ session_id: int = -1,
+ interactive_mode: bool = False,
+ stream: bool = False,
+ stop: bool = False,
+ request_output_len: int = 512,
+ top_p: float = 0.8,
+ top_k: int = 40,
+ temperature: float = 0.8,
+ repetition_penalty: float = 1.0,
+ ignore_eos: bool = False,
+ **kwargs):
+ """Interactive completions.
+
+        - In interactive mode, the chat history is kept on the server. Please
+          set `interactive_mode = True`.
+        - In normal mode, no chat history is kept on the server. Set
+          `interactive_mode = False`.
+
+ Args:
+ prompt: the prompt to use for the generation.
+ session_id: determine which instance will be called.
+                If not specified with a value other than -1, a random value
+                will be used.
+ interactive_mode (bool): turn on interactive mode or not. On
+ interactive mode, session history is kept on the server (and
+ vice versa).
+ stream: whether to stream the results or not.
+ stop: whether to stop the session response or not.
+ request_output_len (int): output token nums
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ top_k (int): The number of the highest probability vocabulary
+ tokens to keep for top-k-filtering
+ temperature (float): to modulate the next token probability
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+
+ Yields:
+            json objects consisting of text, tokens, finish_reason
+ """
+ pload = {
+ k: v
+ for k, v in locals().copy().items()
+ if k[:2] != '__' and k not in ['self']
+ }
+ headers = {'content-type': 'application/json'}
+        response = requests.post(self.chat_interactive_v1_url,
+ headers=headers,
+ json=pload,
+ stream=stream)
+ for chunk in response.iter_lines(chunk_size=8192,
+ decode_unicode=False,
+ delimiter=b'\n'):
+ if chunk:
+ decoded = chunk.decode('utf-8')
+ output = json.loads(decoded)
+ yield output
+
+ def completions_v1(
+ self,
+ model: str,
+ prompt: Union[str, List[Any]],
+ suffix: Optional[str] = None,
+ temperature: Optional[float] = 0.7,
+ n: Optional[int] = 1,
+ max_tokens: Optional[int] = 16,
+ stream: Optional[bool] = False,
+ top_p: Optional[float] = 1.0,
+ user: Optional[str] = None,
+ # additional argument of lmdeploy
+ repetition_penalty: Optional[float] = 1.0,
+ session_id: Optional[int] = -1,
+ ignore_eos: Optional[bool] = False,
+ **kwargs):
+        """Completion v1.
+
+ Args:
+ model (str): model name. Available from /v1/models.
+ prompt (str): the input prompt.
+ suffix (str): The suffix that comes after a completion of inserted
+ text.
+ max_tokens (int): output token nums
+ temperature (float): to modulate the next token probability
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ n (int): How many chat completion choices to generate for each
+ input message. Only support one here.
+ stream: whether to stream the results or not. Default to false.
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ user (str): A unique identifier representing your end-user.
+ ignore_eos (bool): indicator for ignoring eos
+ session_id (int): if not specified, will set random value
+
+ Yields:
+ json objects in openai formats
+ """
+ pload = {
+ k: v
+ for k, v in locals().copy().items()
+ if k[:2] != '__' and k not in ['self']
+ }
+ headers = {'content-type': 'application/json'}
+ response = requests.post(self.completions_v1_url,
+ headers=headers,
+ json=pload,
+ stream=stream)
+ for chunk in response.iter_lines(chunk_size=8192,
+ decode_unicode=False,
+ delimiter=b'\n'):
+ if chunk:
+ if stream:
+                    decoded = chunk.decode('utf-8')
+ if decoded == 'data: [DONE]':
+ continue
+ if decoded[:6] == 'data: ':
+ decoded = decoded[6:]
+ output = json.loads(decoded)
+ yield output
+ else:
+ decoded = chunk.decode('utf-8')
+ output = json.loads(decoded)
+ yield output
+
+ def chat(self,
+ prompt: str,
+ session_id: int,
+ request_output_len: int = 512,
+ stream: bool = False,
+ top_p: float = 0.8,
+ top_k: int = 40,
+ temperature: float = 0.8,
+ repetition_penalty: float = 1.0,
+ ignore_eos: bool = False):
+ """Chat with a unique session_id.
+
+ Args:
+ prompt: the prompt to use for the generation.
+ session_id: determine which instance will be called.
+                If not specified with a value other than -1, a random value
+                will be used.
+ stream: whether to stream the results or not.
+ stop: whether to stop the session response or not.
+ request_output_len (int): output token nums
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ top_k (int): The number of the highest probability vocabulary
+ tokens to keep for top-k-filtering
+ temperature (float): to modulate the next token probability
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+
+ Yields:
+ text, tokens, finish_reason
+ """
+ assert session_id != -1, 'please set a value other than -1'
+ for outputs in self.chat_interactive_v1(
+ prompt,
+ session_id=session_id,
+ request_output_len=request_output_len,
+ interactive_mode=True,
+ stream=stream,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ repetition_penalty=repetition_penalty,
+ ignore_eos=ignore_eos):
+ if outputs['finish_reason'] == 'length':
+                print('WARNING: exceeded session max length.'
+                      ' Please end the session.')
+ yield outputs['text'], outputs['tokens'], outputs['finish_reason']
+
+ def end_session(self, session_id: int):
+ """End the session with a unique session_id.
+
+ Args:
+ session_id: determine which instance will be called.
+                If not specified with a value other than -1, a random value
+                will be used.
+ """
+ for out in self.chat_interactive_v1(prompt='',
+ session_id=session_id,
+ request_output_len=0,
+ interactive_mode=False):
+ pass
+
+
+def input_prompt():
+    """Input a prompt in the console interface."""
+ print('\ndouble enter to end input >>> ', end='')
+ sentinel = '' # ends when this string is seen
+ return '\n'.join(iter(input, sentinel))
+
+
def get_streaming_response(prompt: str,
api_url: str,
session_id: int,
request_output_len: int = 512,
stream: bool = True,
- sequence_start: bool = True,
- sequence_end: bool = True,
+ interactive_mode: bool = False,
ignore_eos: bool = False,
stop: bool = False) -> Iterable[List[str]]:
headers = {'User-Agent': 'Test Client'}
@@ -30,8 +322,7 @@ def get_streaming_response(prompt: str,
'stream': stream,
'session_id': session_id,
'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
+ 'interactive_mode': interactive_mode,
'ignore_eos': ignore_eos,
'stop': stop
}
@@ -50,43 +341,26 @@ def get_streaming_response(prompt: str,
yield output, tokens, finish_reason
-def input_prompt():
- """Input a prompt in the consolo interface."""
- print('\ndouble enter to end input >>> ', end='')
- sentinel = '' # ends when this string is seen
- return '\n'.join(iter(input, sentinel))
-
-
-def main(restful_api_url: str, session_id: int = 0):
- nth_round = 1
+def main(api_server_url: str, session_id: int = 0):
+ api_client = APIClient(api_server_url)
while True:
prompt = input_prompt()
- if prompt == 'exit':
- for output, tokens, finish_reason in get_streaming_response(
- '',
- f'{restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=(nth_round == 1),
- sequence_end=True):
- pass
- exit(0)
+ if prompt in ['exit', 'end']:
+ api_client.end_session(session_id)
+ if prompt == 'exit':
+ exit(0)
else:
- for output, tokens, finish_reason in get_streaming_response(
+ for text, tokens, finish_reason in api_client.chat(
prompt,
- f'{restful_api_url}/generate',
session_id=session_id,
request_output_len=512,
- sequence_start=(nth_round == 1),
- sequence_end=False):
+ stream=True):
if finish_reason == 'length':
- print('WARNING: exceed session max length.'
- ' Please end the session.')
continue
- print(output, end='')
-
- nth_round += 1
+ print(text, end='')
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
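
A minimal sketch of driving the new `APIClient` against a running api_server (the address is an assumption; substitute your own):

```python
from lmdeploy.serve.openai.api_client import APIClient

client = APIClient('http://0.0.0.0:23333')  # assumed api_server address
model_name = client.available_models[0]

# OpenAI-style chat completion
for output in client.chat_completions_v1(
        model=model_name,
        messages=[{'role': 'user', 'content': 'Hi, there'}]):
    print(output)

# stateful interactive chat bound to one session id
for text, tokens, finish_reason in client.chat('Hello', session_id=1):
    print(text, end='', flush=True)
client.end_session(session_id=1)
```
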
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 94271c4b9b..97e5e518c9 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -1,10 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
import os
+import random
import time
from http import HTTPStatus
from typing import AsyncGenerator, List, Optional
-import fire
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
@@ -14,8 +15,10 @@
from lmdeploy.serve.openai.protocol import ( # noqa: E501
ChatCompletionRequest, ChatCompletionResponse,
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
- ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingsRequest,
- EmbeddingsResponse, ErrorResponse, GenerateRequest, GenerateResponse,
+ ChatCompletionStreamResponse, ChatMessage, CompletionRequest,
+ CompletionResponse, CompletionResponseChoice,
+ CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage,
+ EmbeddingsRequest, ErrorResponse, GenerateRequest, GenerateResponse,
ModelCard, ModelList, ModelPermission, UsageInfo)
os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -105,9 +108,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
1.0 means no penalty
Additional arguments supported by LMDeploy:
- - renew_session (bool): Whether renew the session. Can be used when the
- session length is exceeded.
- ignore_eos (bool): indicator for ignoring eos
+ - session_id (int): if not specified, will set random value
Currently we do not support the following features:
- function_call (Users should implement this by themselves)
@@ -115,20 +117,22 @@ async def chat_completions_v1(request: ChatCompletionRequest,
- presence_penalty (replaced with repetition_penalty)
- frequency_penalty (replaced with repetition_penalty)
"""
- session_id = ip2id(raw_request.client.host)
+ if request.session_id == -1:
+ request.session_id = random.randint(1, 10086)
error_check_ret = await check_request(request)
if error_check_ret is not None:
return error_check_ret
model_name = request.model
- request_id = str(session_id)
+ request_id = str(request.session_id)
created_time = int(time.time())
- result_generator = VariableInterface.async_engine.generate_openai(
+ result_generator = VariableInterface.async_engine.generate(
request.messages,
- session_id,
+ request.session_id,
True, # always use stream to enable batching
- request.renew_session,
+ sequence_start=True,
+ sequence_end=True,
request_output_len=request.max_tokens if request.max_tokens else 512,
stop=request.stop,
top_p=request.top_p,
@@ -189,7 +193,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
- VariableInterface.async_engine.stop_session(session_id)
+ VariableInterface.async_engine.stop_session(request.session_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
'Client disconnected')
final_res = res
@@ -223,43 +227,191 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
return response
-@app.post('/v1/embeddings')
-async def create_embeddings(request: EmbeddingsRequest,
- raw_request: Request = None):
- """Creates embeddings for the text."""
+@app.post('/v1/completions')
+async def completions_v1(request: CompletionRequest,
+ raw_request: Request = None):
+ """Completion API similar to OpenAI's API.
+
+ Go to `https://platform.openai.com/docs/api-reference/completions/create`
+ for the API specification.
+
+ The request should be a JSON object with the following fields:
+ - model (str): model name. Available from /v1/models.
+ - prompt (str): the input prompt.
+ - suffix (str): The suffix that comes after a completion of inserted text.
+ - max_tokens (int): output token nums
+ - temperature (float): to modulate the next token probability
+ - top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or higher
+ are kept for generation.
+ - n (int): How many chat completion choices to generate for each input
+ message. Only support one here.
+ - stream: whether to stream the results or not. Default to false.
+ - repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ - user (str): A unique identifier representing your end-user.
+
+ Additional arguments supported by LMDeploy:
+ - ignore_eos (bool): indicator for ignoring eos
+ - session_id (int): if not specified, will set random value
+
+ Currently we do not support the following features:
+ - logprobs (not supported yet)
+ - presence_penalty (replaced with repetition_penalty)
+ - frequency_penalty (replaced with repetition_penalty)
+ """
+ if request.session_id == -1:
+ request.session_id = random.randint(1, 10086)
error_check_ret = await check_request(request)
if error_check_ret is not None:
return error_check_ret
- embedding = await VariableInterface.async_engine.get_embeddings(
- request.input)
- data = [{'object': 'embedding', 'embedding': embedding, 'index': 0}]
- token_num = len(embedding)
- return EmbeddingsResponse(
- data=data,
- model=request.model,
- usage=UsageInfo(
- prompt_tokens=token_num,
- total_tokens=token_num,
- completion_tokens=None,
- ),
- ).dict(exclude_none=True)
-
-
-@app.post('/generate')
-async def generate(request: GenerateRequest, raw_request: Request = None):
+ model_name = request.model
+ request_id = str(request.session_id)
+ created_time = int(time.time())
+ if isinstance(request.prompt, str):
+ request.prompt = [request.prompt]
+ generators = []
+ for i in range(len(request.prompt)):
+ result_generator = VariableInterface.async_engine.generate(
+ request.prompt[i],
+ request.session_id + i,
+ True, # always use stream to enable batching
+ sequence_start=True,
+ sequence_end=True,
+ request_output_len=request.max_tokens
+ if request.max_tokens else 512,
+ stop=False,
+ top_p=request.top_p,
+ temperature=request.temperature,
+ repetition_penalty=request.repetition_penalty,
+ ignore_eos=request.ignore_eos,
+ do_preprocess=False)
+ generators.append(result_generator)
+
+ def create_stream_response_json(
+ index: int,
+ text: str,
+ finish_reason: Optional[str] = None,
+ ) -> str:
+ choice_data = CompletionResponseStreamChoice(
+ index=index,
+ text=text,
+ finish_reason=finish_reason,
+ )
+ response = CompletionStreamResponse(
+ id=request_id,
+ created=created_time,
+ model=model_name,
+ choices=[choice_data],
+ )
+ response_json = response.model_dump_json()
+
+ return response_json
+
+ async def completion_stream_generator() -> AsyncGenerator[str, None]:
+ # First chunk with role
+ for generator in generators:
+ for i in range(request.n):
+ choice_data = CompletionResponseStreamChoice(
+ index=i,
+ text='',
+ finish_reason=None,
+ )
+ chunk = CompletionStreamResponse(id=request_id,
+ choices=[choice_data],
+ model=model_name)
+ data = chunk.model_dump_json(exclude_unset=True)
+ yield f'data: {data}\n\n'
+
+ async for res in generator:
+ response_json = create_stream_response_json(
+ index=0,
+ text=res.response,
+ )
+ yield f'data: {response_json}\n\n'
+ yield 'data: [DONE]\n\n'
+
+ # Streaming response
+ if request.stream:
+ return StreamingResponse(completion_stream_generator(),
+ media_type='text/event-stream')
+
+ # Non-streaming response
+ usage = UsageInfo()
+ choices = []
+
+ async def _inner_call(i, generator):
+ final_res = None
+ text = ''
+ async for res in generator:
+ if await raw_request.is_disconnected():
+ # Abort the request if the client disconnects.
+ VariableInterface.async_engine.stop_session(request.session_id)
+ return create_error_response(HTTPStatus.BAD_REQUEST,
+ 'Client disconnected')
+ final_res = res
+ text += res.response
+ assert final_res is not None
+ choice_data = CompletionResponseChoice(
+ index=0,
+ text=text,
+ finish_reason=final_res.finish_reason,
+ )
+ choices.append(choice_data)
+
+ total_tokens = sum([
+ final_res.history_token_len, final_res.input_token_len,
+ final_res.generate_token_len
+ ])
+ usage.prompt_tokens += final_res.input_token_len
+ usage.completion_tokens += final_res.generate_token_len
+ usage.total_tokens += total_tokens
+
+ await asyncio.gather(
+ *[_inner_call(i, generators[i]) for i in range(len(generators))])
+
+ response = CompletionResponse(
+ id=request_id,
+ created=created_time,
+ model=model_name,
+ choices=choices,
+ usage=usage,
+ )
+
+ return response
+
+
+@app.post('/v1/embeddings', tags=['unsupported'])
+async def create_embeddings(request: EmbeddingsRequest,
+ raw_request: Request = None):
+ """Creates embeddings for the text."""
+ return create_error_response(HTTPStatus.BAD_REQUEST,
+ 'Unsupported by turbomind.')
+
+
+@app.post('/generate',
+ tags=['deprecated'],
+ description='please use /v1/chat/interactive')
+@app.post('/v1/chat/interactive')
+async def chat_interactive_v1(request: GenerateRequest,
+ raw_request: Request = None):
"""Generate completion for the request.
+    - In interactive mode, the chat history is kept on the server. Please set
+      `interactive_mode = True`.
+    - In normal mode, no chat history is kept on the server. Set
+      `interactive_mode = False`.
+
The request should be a JSON object with the following fields:
- prompt: the prompt to use for the generation.
- session_id: determine which instance will be called. If not specified
- with a value other than -1, using host ip directly.
- - sequence_start (bool): indicator for starting a sequence.
- - sequence_end (bool): indicator for ending a sequence
+      with a value other than -1, a random value will be used.
+ - interactive_mode (bool): turn on interactive mode or not. On interactive
+ mode, session history is kept on the server (and vice versa).
- stream: whether to stream the results or not.
- stop: whether to stop the session response or not.
- request_output_len (int): output token nums
- - step (int): the offset of the k/v cache
- top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
@@ -271,15 +423,18 @@ async def generate(request: GenerateRequest, raw_request: Request = None):
- ignore_eos (bool): indicator for ignoring eos
"""
if request.session_id == -1:
- session_id = ip2id(raw_request.client.host)
- request.session_id = session_id
+ request.session_id = random.randint(10087, 23333)
- generation = VariableInterface.async_engine.generate(
+ async_engine = VariableInterface.async_engine
+ sequence_start = async_engine.steps.get(str(request.session_id), 0) == 0
+ sequence_end = not request.interactive_mode
+
+ generation = async_engine.generate(
request.prompt,
request.session_id,
stream_response=True, # always use stream to enable batching
- sequence_start=request.sequence_start,
- sequence_end=request.sequence_end,
+ sequence_start=sequence_start,
+ sequence_end=sequence_end,
request_output_len=request.request_output_len,
top_p=request.top_p,
top_k=request.top_k,
@@ -308,7 +463,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
async for out in generation:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
- VariableInterface.async_engine.stop_session(session_id)
+ async_engine.stop_session(request.session_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
'Client disconnected')
text += out.response
@@ -319,14 +474,15 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
def main(model_path: str,
- server_name: str = 'localhost',
+ server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 32,
tp: int = 1,
allow_origins: List[str] = ['*'],
allow_credentials: bool = True,
allow_methods: List[str] = ['*'],
- allow_headers: List[str] = ['*']):
+ allow_headers: List[str] = ['*'],
+ **kwargs):
"""An example to perform model inference through the command line
interface.
@@ -352,9 +508,12 @@ def main(model_path: str,
VariableInterface.async_engine = AsyncEngine(model_path=model_path,
instance_num=instance_num,
- tp=tp)
+ tp=tp,
+ **kwargs)
uvicorn.run(app=app, host=server_name, port=server_port, log_level='info')
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
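
The renamed `/v1/chat/interactive` route can be exercised directly with the updated `get_streaming_response` helper; `interactive_mode=True` keeps the chat history on the server, so follow-up prompts continue the same session. A sketch (server address assumed):

```python
from lmdeploy.serve.openai.api_client import get_streaming_response

api_server_url = 'http://0.0.0.0:23333'  # assumption: a running api_server
for text, tokens, finish_reason in get_streaming_response(
        'Hello',
        f'{api_server_url}/v1/chat/interactive',
        session_id=1,
        request_output_len=512,
        interactive_mode=True):
    print(text, end='', flush=True)
```
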
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index 756af1a4ca..bee2e2c91c 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -70,7 +70,7 @@ class ChatCompletionRequest(BaseModel):
user: Optional[str] = None
# additional argument of lmdeploy
repetition_penalty: Optional[float] = 1.0
- renew_session: Optional[bool] = False
+ session_id: Optional[int] = -1
ignore_eos: Optional[bool] = False
@@ -135,6 +135,10 @@ class CompletionRequest(BaseModel):
presence_penalty: Optional[float] = 0.0
frequency_penalty: Optional[float] = 0.0
user: Optional[str] = None
+ # additional argument of lmdeploy
+ repetition_penalty: Optional[float] = 1.0
+ session_id: Optional[int] = -1
+ ignore_eos: Optional[bool] = False
class CompletionResponseChoice(BaseModel):
@@ -175,7 +179,7 @@ class CompletionStreamResponse(BaseModel):
class EmbeddingsRequest(BaseModel):
"""Embedding request."""
model: str = None
- input: Union[str, List[Any]]
+ input: Union[str, List[str]]
user: Optional[str] = None
@@ -191,8 +195,7 @@ class GenerateRequest(BaseModel):
"""Generate request."""
prompt: Union[str, List[Dict[str, str]]]
session_id: int = -1
- sequence_start: bool = True
- sequence_end: bool = False
+ interactive_mode: bool = False
stream: bool = False
stop: bool = False
request_output_len: int = 512
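
With this schema change, clients send a single `interactive_mode` flag instead of the old `sequence_start`/`sequence_end` pair; a sketch of a request body matching the updated `GenerateRequest` (values are illustrative):

```python
# Body for POST /v1/chat/interactive under the new GenerateRequest schema.
payload = {
    'prompt': 'Hello',
    'session_id': 1,
    'interactive_mode': True,  # replaces sequence_start / sequence_end
    'stream': True,
    'stop': False,
    'request_output_len': 512,
}
```
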
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index cc12fcff3b..5b89cc506a 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -459,6 +459,10 @@ def _stream_infer(self,
session.sequence_length = 0
input_ids, input_lengths = self.preprocess(prompt)
+        # the engine crashes if last_token_id == eos_id and input_ids is empty
+ if sequence_end and request_output_len == 0:
+ input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+ input_lengths = np.array([[1]], dtype=np.uint32)
input_tokens = input_lengths.squeeze()
if self.profile_generation:
yield StatusCode.TRITON_STREAM_ING, \
@@ -657,8 +661,13 @@ def stream_consumer(postprocess, res_queue, session, n_input_token,
continue
output_str = postprocess(
output_ids, np.array([[n_token]], dtype=np.uint32))
- n_token = output_ids.shape[-1]
text = output_str[0].decode()
+            # a trailing replacement char ('�') indicates a potentially
+            # unfinished byte sequence; hold it back and decode it together
+            # with the next chunk
+ if text.endswith('�'):
+ continue
+ n_token = output_ids.shape[-1]
if display:
print(text, end='', flush=True)
session.response += text
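
The trailing-'�' check works because an incomplete multi-byte UTF-8 sequence decoded with a replacement policy ends in the replacement character; a small generic illustration (the detokenizer's exact decode settings may differ):

```python
# Why a trailing '�' signals an unfinished byte sequence.
s = '你好'.encode('utf-8')                      # 6 bytes, 3 per character
part = s[:4].decode('utf-8', errors='replace')
print(part)                                     # '你�' -> hold back, wait for more bytes
print(part.endswith('�'))                       # True
print(s.decode('utf-8'))                        # '你好' -> complete sequence decodes cleanly
```
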
diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py
deleted file mode 100644
index cc8db88f5c..0000000000
--- a/lmdeploy/serve/turbomind/deploy.py
+++ /dev/null
@@ -1,1046 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import configparser
-import json
-import os
-import os.path as osp
-import re
-import shutil
-import sys
-from pathlib import Path
-
-import fire
-import safetensors
-import torch
-from safetensors.torch import load_file
-from sentencepiece import SentencePieceProcessor
-
-import lmdeploy
-from lmdeploy.model import MODELS
-
-supported_formats = ['llama', 'hf', 'awq', 'qwen']
-
-
-def get_package_root_path():
- import lmdeploy
- return Path(lmdeploy.__file__).parent
-
-
-def create_workspace(_path: str):
- """Create a workspace.
-
- Args:
- _path (str): the path of the workspace
- Returns:
- bool: success or not
- """
- try:
- if osp.exists(_path):
- shutil.rmtree(_path)
- os.makedirs(_path)
- print(f'create workspace in directory {_path}')
- return True
- except Exception as e:
- print(f'create workspace in {_path} failed: {e}')
- return False
-
-
-def destroy_workspace(_path: str):
- """destroy workspace.
-
- Args:
- _path(str): the path of the workspace
- Returns:
- bool: success or not
- """
- try:
- shutil.rmtree(_path)
- print(f'destroy workspace in directory {_path}')
- return True
- except Exception as e:
- print(f'destroy workspace in {_path} failed: {e}')
- return False
-
-
-def copy_triton_model_templates(_path: str):
- """copy triton model templates to the specified path.
-
- Args:
- _path (str): the target path
- Returns:
- str: the path of the triton models
- """
- try:
- cur_path = osp.abspath(__file__)
- dir_path = osp.dirname(cur_path)
- triton_models_path = osp.join(dir_path, 'triton_models')
- dst_path = osp.join(_path, 'triton_models')
- shutil.copytree(triton_models_path, dst_path, symlinks=True)
- print(f'copy triton model templates from "{triton_models_path}" to '
- f'"{dst_path}" successfully')
- shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
- return dst_path
- except Exception as e:
- print(f'copy triton model templates from "{triton_models_path}"'
- f' to "{dst_path}" failed: {e}')
- return None
-
-
-def tokenizer_info_sp(model_path: str):
- """Return the vocabulary size, bos token id and eos token id.
-
- Args:
- model_path (str): the tokenizer model's path
- Returns:
- tuple: vocabulary size, bos token id and eos token id
- """
- assert os.path.isfile(model_path), model_path
- sp_model = SentencePieceProcessor(model_file=model_path)
- # BOS / EOS token IDs
- n_words = sp_model.vocab_size()
- bos_id = sp_model.bos_id()
- eos_id = sp_model.eos_id()
- return n_words, bos_id, eos_id
-
-
-def tokenizer_info_qwen(model_dir: str):
- n_words = 151851
- bos_id = 0
- eos_id = 151643
- return n_words, bos_id, eos_id
-
-
-def load_checkpoint(model_path):
- """Load checkpoint files into torch format.
-
- Args:
- model_path (str): the checkpoint folder
- Returns:
- Dict[str, torch.Tensor]: weight in torch format
- """
- suffixes = ['.safetensors', '.bin']
- for suffix in suffixes:
- files = [
- file for file in os.listdir(model_path) if file.endswith(suffix)
- ]
- if len(files) > 0:
- break
-
- assert len(files) > 0, f'could not find checkpoints in {model_path}'
- files = sorted(files)
- print(files)
- params = {}
- for file in files:
- if file.endswith('.bin'):
- tmp = torch.load(osp.join(model_path, file), map_location='cpu')
- else:
- tmp = load_file(osp.join(model_path, file))
- params.update(tmp)
- return params
-
-
-def export(model_name: str,
- num_layer: int,
- norm_eps: float,
- kv_head_num: int,
- model_params: dict,
- tokenizer_path: str,
- out_dir: str,
- tp: int,
- size_per_head: int = 128,
- group_size: int = 0,
- weight_type: str = 'fp16',
- max_position_embeddings: int = 0,
- use_dynamic_ntk: int = 0,
- use_logn_attn: int = 0,
- rope_theta: float = 10000.0,
- tokenizer_info=tokenizer_info_sp):
- """Export deploying information to a config file.
-
- Args:
- model_name (str): model's name
- num_layer (int): the number of transformer blocks
- norm_eps (float): norm epsilon
- model_params (dict): parameters of a model
- tokenizer_path (str): the tokenizer model's path
- out_dir (str): the path of the output directory
- tp (int): the number of tensor parallelism
- size_per_head (int): the dimension of each head
- """
- out_dir = osp.join(out_dir, 'weights')
- os.makedirs(out_dir, exist_ok=True)
-
- def save_bin(param: torch.Tensor, name):
- print(name, param.shape)
- if param.dtype in [torch.float, torch.bfloat16]:
- param = param.half()
- param.contiguous().cpu().numpy().tofile(osp.join(out_dir, name))
-
- attn_bias = False
- inter_size = 0
-
- tok_embeddings = model_params['tok_embeddings.weight']
- _vocab_size, dim = tok_embeddings.shape
- head_num = dim // size_per_head
- if _vocab_size % tp != 0:
- # Resolve https://github.com/InternLM/lmdeploy/issues/266
- # Pad tok_embeddings and output weights, making their shape divisible by TP # noqa: E501
- pad_size = (_vocab_size + tp - 1) // tp * tp - _vocab_size
- # Pad weight at the bottom of dim 0
- model_params['tok_embeddings.weight'] = torch.nn.functional.pad(
- tok_embeddings, (0, 0, 0, pad_size), 'constant', 0)
- # Pad output weight at the bottom of dim 0
- model_params['output.weight'] = torch.nn.functional.pad(
- model_params['output.weight'], (0, 0, 0, pad_size), 'constant', 0)
-
- # reverse the splitting axes since the weights are transposed above
- for param_name, param_data in model_params.items():
- split_dim = None
- key, ext = param_name.split('.')[-2:]
- if key == 'w_qkv' and ext == 'bias':
- attn_bias = True
- copy = False
- if key in ['w1', 'w3', 'w13', 'w_qkv']:
- split_dim = -1
- # TODO: move parameter extraction outside of the loop
- if key == 'w1':
- inter_size = max(inter_size, param_data.shape[-1])
- elif key == 'w13':
- inter_size = max(inter_size, param_data.shape[-1] // 2)
- elif key in ['w2', 'wo']:
- if ext in ['bias']:
- copy = True
- else:
- split_dim = 0
- if split_dim is not None:
- print(f'*** splitting {param_name}, shape={param_data.shape}, '
- f'split_dim={split_dim}')
- assert param_data.shape[split_dim] % tp == 0
- split_size = param_data.shape[split_dim] // tp
- splits = torch.split(param_data, split_size, dim=split_dim)
- for i, split in enumerate(splits):
- prefix, ext = osp.splitext(param_name)
- save_bin(split, f'{prefix}.{i}{ext}')
- elif copy:
- print(f'### copying {param_name}, shape={param_data.shape}')
- copies = [param_data] * tp
- for i, copy in enumerate(copies):
- prefix, ext = osp.splitext(param_name)
- save_bin(copy, f'{prefix}.{i}{ext}')
- else:
- save_bin(param_data, param_name)
-
- assert inter_size > 0
-
- # export config and save it to {out_dir}/config.ini
- model = MODELS.get(model_name)()
- vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path)
- assert _vocab_size >= vocab_size, \
- f'different vocab size {_vocab_size} vs {vocab_size}'
- cfg = dict(llama=dict(
- model_name=model_name,
- head_num=head_num,
- kv_head_num=kv_head_num,
- size_per_head=size_per_head,
- vocab_size=_vocab_size,
- num_layer=num_layer,
- rotary_embedding=size_per_head,
- rope_theta=rope_theta,
- inter_size=inter_size,
- norm_eps=norm_eps,
- attn_bias=int(attn_bias),
- start_id=bos_id,
- end_id=eos_id,
- weight_type=weight_type,
- group_size=group_size,
- # parameters for turbomind
- max_batch_size=32,
- max_context_token_num=4,
- session_len=model.session_len + 8,
- step_length=1,
- cache_max_entry_count=48,
- cache_chunk_size=1,
- use_context_fmha=1,
- quant_policy=0,
- tensor_para_size=tp,
- # extra attention params
- max_position_embeddings=max_position_embeddings,
- use_dynamic_ntk=int(use_dynamic_ntk),
- use_logn_attn=int(use_logn_attn),
- ))
-
- config = configparser.ConfigParser()
- for section, key_values in cfg.items():
- config[section] = key_values
-
- config_path = osp.join(out_dir, 'config.ini')
- with open(config_path, 'w') as f:
- config.write(f)
- return True
-
-
-def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
- dim: int):
-
- def reshape(x):
- return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)
-
- qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
-
- # (input_dim, head_num + 2 * kv_head_num)
- return qkv.view(q.size(0), -1)
-
-
-def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- """
- if osp.exists(tokenizer_path):
- shutil.copy(tokenizer_path,
- osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- return False
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'params.json')
- with open(params_path) as f:
- model_arg = json.load(f)
- num_layer = model_arg['n_layers']
- norm_eps = model_arg['norm_eps']
- head_num = model_arg.get('n_heads', 32)
- kv_head_num = model_arg.get('n_kv_heads', head_num)
- except Exception as e:
- print(f'get "n_layers" and "norm_eps" from {params_path} failed: {e}')
- return False
-
- # convert weights from llama to turbomind format
- checkpoints = []
- for pattern in ['*.pth', '*.pt']:
- checkpoints += sorted(Path(model_path).glob(pattern))
- print(checkpoints)
- n_ckpt = len(checkpoints)
- model_params = {}
-
- def get_param(_name, _size):
- print(_name, _size)
- if _name not in model_params:
- model_params[_name] = torch.zeros(_size,
- dtype=torch.float16,
- device='cpu')
- return model_params[_name]
-
- for i, ckpt_path in enumerate(checkpoints):
- ckpt = torch.load(ckpt_path, map_location='cpu')
- for param_name, param_data in ckpt.items():
- key, ext = param_name.split('.')[-2:]
- # column-parallel
- if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
- size = param_data.size(0)
- if ext == 'weight':
- param = get_param(
- param_name,
- [size * n_ckpt, param_data.size(1)])
- param.data[size * i:size * (i + 1), :] = param_data
- else: # bias
- param = get_param(param_name, [size * n_ckpt])
- param.data[size * i:size * (i + 1)] = param_data
- # row-parallel
- elif key in ['w2', 'wo', 'tok_embeddings']:
- size = param_data.size(-1)
- if ext == 'weight':
- param = get_param(param_name,
- [param_data.size(0), size * n_ckpt])
- param.data[:, size * i:size * (i + 1)] = param_data
- else: # bias
- param = get_param(param_name, [size])
- param.data = param_data
- elif i == 0:
- param = get_param(param_name, param_data.size())
- param.data = param_data
- del ckpt
-
- for name, param in model_params.items():
- # transpose all weights as TurboMind is expecting column-major
- # weights: (output_dims, input_dims) -> (input_dims, output_dims)
- key = name.split('.')[-2]
- if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
- param.data = param.data.t()
-
- # concat qkv projection
- for t in ['weight', 'bias']:
- for i in range(1000):
- _qkv = [
- f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv']
- ]
- try:
- qkv = tuple(map(model_params.pop, _qkv))
- except KeyError:
- break
- # concat by heads
- qkv = merge_qkv(*qkv, tp, dim=2 if t == 'weight' else 1)
- print(f'layers.{i}.attention.w_qkv.{t}', qkv.shape)
- model_params[f'layers.{i}.attention.w_qkv.{t}'] = qkv
-
- assert i == 0 or num_layer == i, f'miss matched layers: {num_layer} vs {i}'
-
- return export(model_name, num_layer, norm_eps, kv_head_num, model_params,
- tokenizer_path, triton_models_path, tp)
-
-
-def permute(x: torch.Tensor):
- SIZE_PER_HEAD = 128
- if x.shape[-1] > 1:
- dim = x.shape[-1]
- n_heads = dim // SIZE_PER_HEAD
- return x.view(-1, n_heads, 2,
- dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
- else: # scales, zeros
- dim = x.shape[0]
- n_heads = dim // SIZE_PER_HEAD
- return x.view(n_heads, 2, dim // n_heads // 2,
- 1).transpose(1, 2).reshape(dim, 1)
-
-
-def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- """
- if tokenizer_path is None:
- tokenizer_path = osp.join(model_path, 'tokenizer.model')
- if osp.exists(tokenizer_path):
- shutil.copy(tokenizer_path,
- osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
- for _file in os.listdir(model_path):
- if _file.endswith('.json') or _file.endswith('.py'):
- json_path = osp.join(model_path, _file)
- shutil.copy(json_path,
- osp.join(triton_models_path, 'tokenizer', _file))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- exit(-1)
-
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'config.json')
- with open(params_path) as f:
- model_arg = json.load(f)
- num_layer = model_arg['num_hidden_layers']
- norm_eps = model_arg['rms_norm_eps']
- rope_theta = float(model_arg.get('rope_theta', 10000.0))
- max_position_embeddings = int(
- model_arg.get('max_position_embeddings', 0))
- repo_scaling = bool(model_arg.get('rope_scaling', False))
- if 'num_key_value_heads' in model_arg:
- kv_head_num = model_arg['num_key_value_heads']
- else:
- kv_head_num = model_arg['num_attention_heads']
- except Exception as e:
- print(f'get "num_hidden_layers" and "rms_norm_eps" from '
- f'{params_path} failed: {e}')
- return False
-
- # convert weights from hf to turbomind
- model_params = {}
-
- _qweight = 'weight'
- _suffixes = [_qweight, 'bias']
-
- _params = load_checkpoint(model_path)
-
- def get_tensor(name):
- """return tensor according its name."""
- return _params[name]
-
- def get_tensor_transposed(name: str):
- """return a transposed tensor according its name."""
- if name not in _params and name.find('bias'):
- return None
- return _params[name].t()
-
- w_pack = False
- if 'model.layers.0.self_attn.W_pack.weight' in _params:
- w_pack = True
-
- for i in range(1000):
- try:
- # attention weights
- for suffix in _suffixes:
- if w_pack:
- _qkvo = [
- f'model.layers.{i}.self_attn.{t}'
- for t in ['W_pack', 'o_proj']
- ]
- qkv, o = map(get_tensor_transposed,
- map(('{}.' + suffix).format, _qkvo))
-
- if qkv is None:
- continue
- _shape = qkv.shape[1] // 3
- _qkv = torch.split(qkv, [_shape, _shape, _shape], dim=1)
- q = _qkv[0]
- k = _qkv[1]
- v = _qkv[2]
-
- else:
- _qkvo = [
- f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo'
- ]
- q, k, v, o = map(get_tensor_transposed,
- map(('{}.' + suffix).format, _qkvo))
- if q is None:
- continue
- # q, k has different layout for fb & hf, convert to fb's
- # layout
- q = permute(q)
- k = permute(k)
- if suffix == _qweight: # weight, qweight
- qkv = merge_qkv(q, k, v, tp, dim=2)
- print(suffix, qkv.shape)
- else: # scales, zeros, bias
- qkv = merge_qkv(q, k, v, tp, dim=1)
- print(suffix, qkv.shape)
- for k, v in [('w_qkv', qkv), ('wo', o)]:
- model_params[f'layers.{i}.attention.{k}.{suffix}'] = v
- # ffn weights
- _w123 = [
- f'model.layers.{i}.mlp.{t}_proj'
- for t in ['gate', 'down', 'up']
- ]
- for suffix in _suffixes:
- w1, w2, w3 = map(get_tensor_transposed,
- map(('{}.' + suffix).format, _w123))
- if w1 is None:
- continue
- if suffix in ['scales', 'zeros', 'bias']:
- w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3])
- for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]:
- model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v
- other = [('attention_norm.weight', 'input_layernorm.weight'),
- ('ffn_norm.weight', 'post_attention_layernorm.weight')]
- for ft, hf in other:
- model_params[f'layers.{i}.' +
- ft] = get_tensor(f'model.layers.{i}.' + hf)
- except safetensors.SafetensorError:
- break
- except KeyError:
- break
-
- assert num_layer == i, f'miss matched layers: {num_layer} vs {i}'
-
- other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
- ('norm.weight', 'model.norm.weight'),
- ('output.weight', 'lm_head.weight')]
- for ft, hf in other:
- model_params[ft] = get_tensor(hf)
-
- if model_name == 'baichuan2-7b':
- # https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/modeling_baichuan.py#L507
- # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
- model_params['output.weight'] = torch.nn.functional.normalize(
- model_params['output.weight'])
-
- return export(model_name,
- num_layer,
- norm_eps,
- kv_head_num,
- model_params,
- tokenizer_path,
- triton_models_path,
- tp,
- max_position_embeddings=max_position_embeddings,
- use_dynamic_ntk=repo_scaling,
- rope_theta=rope_theta)
-
-
-def deploy_awq(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int, quant_path: str,
- group_size: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- quant_path (str): path of the quantized model, which can be None
- group_size (int): a parameter used in AWQ to quantize fp16 weights
- to 4 bits
- """
- if tokenizer_path is None:
- tokenizer_path = osp.join(model_path, 'tokenizer.model')
- if osp.exists(tokenizer_path):
- shutil.copy(tokenizer_path,
- osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
- for _file in os.listdir(model_path):
- if _file.endswith('.json') or _file.endswith('.py'):
- json_path = osp.join(model_path, _file)
- shutil.copy(json_path,
- osp.join(triton_models_path, 'tokenizer', _file))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- exit(-1)
-
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'config.json')
- with open(params_path) as f:
- model_arg = json.load(f)
- num_layer = model_arg['num_hidden_layers']
- norm_eps = model_arg['rms_norm_eps']
- rope_theta = float(model_arg.get('rope_theta', 10000.0))
- if 'num_key_value_heads' in model_arg:
- kv_head_num = model_arg['num_key_value_heads']
- else:
- kv_head_num = model_arg['num_attention_heads']
- except Exception as e:
- print(f'get "num_hidden_layers" and "rms_norm_eps" from '
- f'{params_path} failed: {e}')
- return False
-
- # convert weights from hf to turbomind
- if quant_path is None:
- _files = [
- osp.join(model_path, file) for file in os.listdir(model_path)
- if file.endswith('.bin')
- ]
- _files = sorted(_files)
- else:
- _files = [quant_path]
-
- model_params = {}
-
- _params = {}
- for _file in _files:
- _tmp = torch.load(_file, map_location='cpu')
- _params.update(_tmp)
-
- def get_tensor(name):
- """return tensor according its name."""
- return _params[name].cuda().contiguous()
-
- # import _turbomind as _tm
- # TODO: find another way import _turbomind
- lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
- sys.path.append(osp.join(lmdeploy_dir, 'lib'))
- import _turbomind as _tm # noqa: E402
-
- def transpose_qk_s4(src: torch.Tensor):
- assert src.is_contiguous()
- dst = torch.zeros_like(src)
- _tm.transpose_qk_s4_k_m8(src, dst,
- src.size(-1) * 8, src.size(0), group_size)
- return dst
-
- def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor,
- w1_s: torch.Tensor, w3_qw: torch.Tensor,
- w3_qz: torch.Tensor, w3_s: torch.Tensor):
-
- def fuse(a: torch.Tensor, b: torch.Tensor):
- ab = torch.cat((a, b)).contiguous()
- _ab = torch.zeros_like(ab)
- _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
- return _ab.view(a.size(0), -1)
-
- w13_qw = fuse(w1_qw, w3_qw)
- w13_qz = fuse(w1_qz, w3_qz)
-
- w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
- w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)
-
- return w13_qw, w13_qz, w13_s
-
- def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
- group_size: int):
- assert qw.is_contiguous()
- assert qz.is_contiguous()
- assert s.is_contiguous()
- _qw = torch.zeros_like(qw)
- _sz = torch.zeros_like(s, dtype=torch.int32) # half2
- _ws = torch.zeros_like(s)
- _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
- qw.size(-1) * 8, qw.size(0), group_size)
- return _qw, _sz
-
- def tp_m_s4(x: torch.Tensor, tp: int):
- return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
- 1).contiguous()
-
- attn_bias = False
-
- for i in range(num_layer):
- print(i)
-
- # attention weights
- q_qw = get_tensor(f'model.layers.{i}.self_attn.q_proj.qweight')
- k_qw = get_tensor(f'model.layers.{i}.self_attn.k_proj.qweight')
- v_qw = get_tensor(f'model.layers.{i}.self_attn.v_proj.qweight')
- o_qw = get_tensor(f'model.layers.{i}.self_attn.o_proj.qweight')
-
- q_qz = get_tensor(f'model.layers.{i}.self_attn.q_proj.qzeros')
- k_qz = get_tensor(f'model.layers.{i}.self_attn.k_proj.qzeros')
- v_qz = get_tensor(f'model.layers.{i}.self_attn.v_proj.qzeros')
- o_qz = get_tensor(f'model.layers.{i}.self_attn.o_proj.qzeros')
-
- q_s = get_tensor(f'model.layers.{i}.self_attn.q_proj.scales')
- k_s = get_tensor(f'model.layers.{i}.self_attn.k_proj.scales')
- v_s = get_tensor(f'model.layers.{i}.self_attn.v_proj.scales')
- o_s = get_tensor(f'model.layers.{i}.self_attn.o_proj.scales')
-
- try:
- q_b = get_tensor(f'model.layers.{i}.self_attn.q_proj.bias')
- k_b = get_tensor(f'model.layers.{i}.self_attn.k_proj.bias')
- v_b = get_tensor(f'model.layers.{i}.self_attn.v_proj.bias')
- o_b = get_tensor(f'model.layers.{i}.self_attn.o_proj.bias')
- attn_bias = True
- except: # noqa: E722
- pass
-
- q_qw = transpose_qk_s4(q_qw)
- k_qw = transpose_qk_s4(k_qw)
- q_qz = transpose_qk_s4(q_qz)
- k_qz = transpose_qk_s4(k_qz)
- q_s = permute(q_s)
- k_s = permute(k_s)
-
- qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
- qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
- qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
-
- qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
-
- qkv_qw = tp_m_s4(qkv_qw, tp)
-
- model_params[f'layers.{i}.attention.w_qkv.qweight'] = qkv_qw
- model_params[f'layers.{i}.attention.w_qkv.scales_zeros'] = qkv_sz
-
- o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
-
- model_params[f'layers.{i}.attention.wo.qweight'] = o_qw
- model_params[f'layers.{i}.attention.wo.scales_zeros'] = o_sz
-
- if attn_bias:
- q_b = permute(q_b)
- k_b = permute(k_b)
- qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
- model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b
- model_params[f'layers.{i}.attention.wo.bias'] = o_b
-
- # ffn weights
- w1_qw = get_tensor(f'model.layers.{i}.mlp.gate_proj.qweight')
- w2_qw = get_tensor(f'model.layers.{i}.mlp.down_proj.qweight')
- w3_qw = get_tensor(f'model.layers.{i}.mlp.up_proj.qweight')
-
- w1_qz = get_tensor(f'model.layers.{i}.mlp.gate_proj.qzeros')
- w2_qz = get_tensor(f'model.layers.{i}.mlp.down_proj.qzeros')
- w3_qz = get_tensor(f'model.layers.{i}.mlp.up_proj.qzeros')
-
- w1_s = get_tensor(f'model.layers.{i}.mlp.gate_proj.scales')
- w2_s = get_tensor(f'model.layers.{i}.mlp.down_proj.scales')
- w3_s = get_tensor(f'model.layers.{i}.mlp.up_proj.scales')
-
- w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
- w3_s)
-
- w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
- w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
-
- w13_qw = tp_m_s4(w13_qw, tp)
-
- model_params[f'layers.{i}.feed_forward.w13.qweight'] = w13_qw
- model_params[f'layers.{i}.feed_forward.w13.scales_zeros'] = w13_sz
-
- model_params[f'layers.{i}.feed_forward.w2.qweight'] = w2_qw
- model_params[f'layers.{i}.feed_forward.w2.scales_zeros'] = w2_sz
-
- # norm weights
- attn_norm = get_tensor(f'model.layers.{i}.input_layernorm.weight')
- ffn_norm = get_tensor(
- f'model.layers.{i}.post_attention_layernorm.weight')
-
- model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
- model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm
-
- other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
- ('norm.weight', 'model.norm.weight'),
- ('output.weight', 'lm_head.weight')]
- for ft, hf in other:
- model_params[ft] = get_tensor(hf)
-
- return export(model_name,
- num_layer,
- norm_eps,
- kv_head_num,
- model_params,
- tokenizer_path,
- triton_models_path,
- tp,
- weight_type='int4',
- group_size=group_size,
- rope_theta=rope_theta)
-
-
-def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- quant_path (str): path of the quantized model, which can be None
- group_size (int): a parameter used in AWQ to quantize fp16 weights
- to 4 bits
- """
-
- if osp.exists(model_path):
- shutil.copy(osp.join(model_path, 'qwen.tiktoken'),
- osp.join(triton_models_path, 'tokenizer'))
- for _file in os.listdir(model_path):
- if _file.endswith('.json') or _file.endswith('.py'):
- json_path = osp.join(model_path, _file)
- shutil.copy(json_path,
- osp.join(triton_models_path, 'tokenizer', _file))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- exit(-1)
-
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'config.json')
- with open(params_path) as f:
- config = json.load(f)
- num_layer = config['num_hidden_layers']
- norm_eps = config['layer_norm_epsilon']
- rope_theta = float(config.get('rotary_emb_base', 10000.0))
- if 'num_key_value_heads' in config:
- kv_head_num = config['num_key_value_heads']
- else:
- kv_head_num = config['num_attention_heads']
- seq_length = config['seq_length']
- use_dynamic_ntk = config['use_dynamic_ntk']
- use_logn_attn = config['use_logn_attn']
- except Exception as e:
- print(f'get "num_hidden_layers" and "layer_norm_epsilon" from '
- f'{params_path} failed: {e}')
- return False
-
- # convert weights from hf to turbomind
- model_params = {}
-
- _params = load_checkpoint(model_path)
-
- def get_tensor(name, trans=True):
- """return a transposed tensor according its name."""
- if trans:
- return _params[name].cuda().t()
- else:
- return _params[name].cuda()
-
- for i in range(num_layer):
- print(i)
-
- # qkv weights
- qkv_w = get_tensor(f'transformer.h.{i}.attn.c_attn.weight')
- q_w, k_w, v_w = torch.split(qkv_w, qkv_w.size(-1) // 3, dim=-1)
- q_w, k_w = permute(q_w), permute(k_w)
- qkv_w = merge_qkv(q_w, k_w, v_w, tp, dim=2)
- model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv_w
-
- # qkv bias
- qkv_b = get_tensor(f'transformer.h.{i}.attn.c_attn.bias')
- q_b, k_b, v_b = torch.split(qkv_b, qkv_b.size(-1) // 3)
- q_b, k_b = permute(q_b), permute(k_b)
- qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
- model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b
-
- # o weights
- o_w = get_tensor(f'transformer.h.{i}.attn.c_proj.weight')
- model_params[f'layers.{i}.attention.wo.weight'] = o_w
- model_params[f'layers.{i}.attention.wo.bias'] = torch.zeros_like(q_b)
-
- # ffn weights
- # ours: w2(silu(w1(x)) * w3(x))
- # qwen: c_proj(w1(x) * silu(w2(x)))
- w1 = get_tensor(f'transformer.h.{i}.mlp.w2.weight')
- w3 = get_tensor(f'transformer.h.{i}.mlp.w1.weight')
- w2 = get_tensor(f'transformer.h.{i}.mlp.c_proj.weight')
- model_params[f'layers.{i}.feed_forward.w1.weight'] = w1
- model_params[f'layers.{i}.feed_forward.w2.weight'] = w2
- model_params[f'layers.{i}.feed_forward.w3.weight'] = w3
-
- # norm weights
- attn_norm = get_tensor(f'transformer.h.{i}.ln_1.weight')
- ffn_norm = get_tensor(f'transformer.h.{i}.ln_2.weight')
-
- model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
- model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm
-
- other = [('tok_embeddings.weight', 'transformer.wte.weight'),
- ('norm.weight', 'transformer.ln_f.weight'),
- ('output.weight', 'lm_head.weight')]
- for ft, hf in other:
- model_params[ft] = get_tensor(hf, trans=False)
-
- return export(model_name,
- num_layer,
- norm_eps,
- kv_head_num,
- model_params,
- model_path,
- triton_models_path,
- tp,
- max_position_embeddings=seq_length,
- use_dynamic_ntk=use_dynamic_ntk,
- use_logn_attn=use_logn_attn,
- rope_theta=rope_theta,
- tokenizer_info=tokenizer_info_qwen)
-
-
-def pack_model_repository(workspace_path: str):
- """package the model repository.
-
- Args:
- workspace_path: the path of workspace
- """
- os.symlink(src='../../tokenizer',
- dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
- '1', 'tokenizer'))
- os.symlink(src='../../tokenizer',
- dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
- '1', 'tokenizer'))
- os.symlink(src='../../weights',
- dst=osp.join(workspace_path, 'triton_models', 'interactive',
- '1', 'weights'))
- model_repo_dir = osp.join(workspace_path, 'model_repository')
- os.makedirs(model_repo_dir, exist_ok=True)
- os.symlink(src=osp.join('../triton_models/interactive'),
- dst=osp.join(model_repo_dir, 'turbomind'))
- os.symlink(src=osp.join('../triton_models/preprocessing'),
- dst=osp.join(model_repo_dir, 'preprocessing'))
- os.symlink(src=osp.join('../triton_models/postprocessing'),
- dst=osp.join(model_repo_dir, 'postprocessing'))
-
-
-def main(model_name: str,
- model_path: str,
- model_format: str = None,
- tokenizer_path: str = None,
- dst_path: str = './workspace',
- tp: int = 1,
- quant_path: str = None,
- group_size: int = 0):
- """deploy llama family models via turbomind.
-
- Args:
- model_name (str): the name of the to-be-deployed model, such as
- llama-7b, llama-13b, vicuna-7b and etc
- model_path (str): the directory path of the model
- model_format (str): the format of the model, fb or hf. 'fb' stands for
- META's llama format, and 'hf' means huggingface format
- tokenizer_path (str): the path of tokenizer model
- dst_path (str): the destination path that saves outputs
- tp (int): the number of GPUs used for tensor parallelism, should be 2^n
- quant_path (str): path of the quantized model, which can be None
- group_size (int): a parameter used in AWQ to quantize fp16 weights
- to 4 bits
- """
- assert model_name in MODELS.module_dict.keys(), \
- f"'{model_name}' is not supported. " \
- f'The supported models are: {MODELS.module_dict.keys()}'
-
- assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
-
- if model_format is None:
- model_format = 'qwen' if model_name == 'qwen-7b' else 'hf'
-
- if model_format not in supported_formats:
- print(f'the model format "{model_format}" is not supported. '
- f'The supported format are: {supported_formats}')
- exit(-1)
-
- if model_format == 'llama' and tokenizer_path is None:
- print('The model is llama. Its tokenizer model path should be '
- 'specified')
- exit(-1)
-
- if not create_workspace(dst_path):
- exit(-1)
-
- triton_models_path = copy_triton_model_templates(dst_path)
- if triton_models_path is None:
- exit(-1)
-
- if model_format == 'llama':
- res = deploy_llama(model_name, model_path, tokenizer_path,
- triton_models_path, tp)
- elif model_format == 'hf':
- res = deploy_hf(model_name, model_path, tokenizer_path,
- triton_models_path, tp)
- elif model_format == 'awq':
- res = deploy_awq(model_name, model_path, tokenizer_path,
- triton_models_path, tp, quant_path, group_size)
- elif model_format == 'qwen':
- res = deploy_qwen(model_name, model_path, tokenizer_path,
- triton_models_path, tp)
-
- # update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
- with open(osp.join(triton_models_path, 'interactive/config.pbtxt'),
- 'a') as f:
- param = \
- 'parameters {\n key: "tensor_para_size"\n value: {\n ' \
- 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \
- 'parameters {\n key: "model_name"\n value: {\n ' \
- 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n'
- f.write(param)
- if not res:
- print(f'deploy model "{model_name}" via turbomind failed')
- destroy_workspace(dst_path)
- exit(-1)
-
- # pack model repository for triton inference server
- pack_model_repository(dst_path)
-
- # update the value of $TP in `service_docker_up.sh`
- file_path = osp.join(dst_path, 'service_docker_up.sh')
- with open(file_path, 'r') as f:
- content = f.read()
- content = re.sub('TP=1', f'TP={tp}', content)
- with open(file_path, 'w') as f:
- f.write(content)
-
-
-if __name__ == '__main__':
- fire.Fire(main)
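
For reference, the exporter deleted above padded `tok_embeddings.weight` and `output.weight` so that the vocabulary dimension stays divisible by the tensor-parallel degree (the issue #266 fix). A small numeric sketch of that padding rule; the shapes are made up for illustration:

```python
import torch

vocab_size, hidden, tp = 32001, 4096, 2
pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size  # -> 1 extra row
weight = torch.zeros(vocab_size, hidden)
# pad rows at the bottom of dim 0 so every rank receives an equal slice
padded = torch.nn.functional.pad(weight, (0, 0, 0, pad_size), 'constant', 0)
assert padded.shape[0] % tp == 0
```
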
diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index 296d453ed4..231601fde0 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
+import os
import os.path as osp
from typing import Optional, Sequence, Union
@@ -16,7 +17,7 @@ class SentencePieceTokenizer:
def __init__(self, model_file: str):
from sentencepiece import SentencePieceProcessor
self.model = SentencePieceProcessor(model_file=model_file)
- self._no_prefix_space_tokens = None
+ self._prefix_space_tokens = None
@property
def vocab_size(self):
@@ -34,19 +35,20 @@ def eos_token_id(self):
return self.model.eos_id()
@property
- def no_prefix_space_tokens(self):
+ def prefix_space_tokens(self):
"""tokens without prefix space."""
- if self._no_prefix_space_tokens is None:
+ if self._prefix_space_tokens is None:
vocab = self.model.IdToPiece(list(range(self.vocab_size)))
- self._no_prefix_space_tokens = {
+ self._prefix_space_tokens = {
i
- for i, tok in enumerate(vocab) if not tok.startswith('▁')
+ for i, tok in enumerate(vocab) if tok.startswith('▁')
}
- return self._no_prefix_space_tokens
+ return self._prefix_space_tokens
def _maybe_add_prefix_space(self, tokens, decoded):
"""maybe add prefix space for incremental decoding."""
- if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
return ' ' + decoded
else:
return decoded
@@ -111,8 +113,7 @@ class HuggingFaceTokenizer:
"""
def __init__(self, model_dir: str, trust_remote_code=True):
- from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
- LlamaTokenizerFast)
+ from transformers import AutoTokenizer
model_file = osp.join(model_dir, 'tokenizer.model')
backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
model_file_exists = osp.exists(model_file)
@@ -121,20 +122,22 @@ def __init__(self, model_dir: str, trust_remote_code=True):
'It may take long time to initialize the tokenizer.')
self.model = AutoTokenizer.from_pretrained(
model_dir, trust_remote_code=trust_remote_code)
- self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
- or isinstance(self.model, CodeLlamaTokenizerFast)
- self._no_prefix_space_tokens = None
+ self._prefix_space_tokens = None
# save tokenizer.json to reuse
if not osp.exists(backend_tokenizer_file) and model_file_exists:
if hasattr(self.model, 'backend_tokenizer'):
- self.model.backend_tokenizer.save(backend_tokenizer_file)
+ if os.access(model_dir, os.W_OK):
+ self.model.backend_tokenizer.save(backend_tokenizer_file)
if self.model.eos_token_id is None:
generation_config_file = osp.join(model_dir,
'generation_config.json')
- with open(generation_config_file, 'r') as f:
- cfg = json.load(f)
- self.model.eos_token_id = cfg['eos_token_id']
+ if osp.exists(generation_config_file):
+ with open(generation_config_file, 'r') as f:
+ cfg = json.load(f)
+ self.model.eos_token_id = cfg['eos_token_id']
+ elif hasattr(self.model, 'eod_id'): # Qwen remote
+ self.model.eos_token_id = self.model.eod_id
@property
def vocab_size(self):
@@ -152,21 +155,22 @@ def eos_token_id(self):
return self.model.eos_token_id
@property
- def no_prefix_space_tokens(self):
+ def prefix_space_tokens(self):
"""tokens without prefix space."""
- if self._no_prefix_space_tokens is None:
+ if self._prefix_space_tokens is None:
vocab = self.model.convert_ids_to_tokens(
list(range(self.vocab_size)))
- self._no_prefix_space_tokens = {
+ self._prefix_space_tokens = {
i
- for i, tok in enumerate(vocab) if not tok.startswith('▁')
+ for i, tok in enumerate(vocab)
+ if tok.startswith('▁' if isinstance(tok, str) else b' ')
}
- return self._no_prefix_space_tokens
+ return self._prefix_space_tokens
def _maybe_add_prefix_space(self, tokens, decoded):
"""maybe add prefix space for incremental decoding."""
- if self.need_padding and len(
- tokens) and tokens[0] not in self.no_prefix_space_tokens:
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
return ' ' + decoded
else:
return decoded
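
Both tokenizer wrappers now collect the ids of tokens that begin with `▁` (SentencePiece's word-boundary marker) and re-insert the leading space that incremental decoding drops. A minimal sketch of the check; the id set below is a stand-in for the real vocabulary:

```python
# Illustrative only: prefix_space_tokens would be built from the vocabulary.
prefix_space_tokens = {7, 42}  # ids whose piece starts with '▁'

def maybe_add_prefix_space(tokens, decoded):
    """Re-add the space lost when a chunk is decoded in isolation."""
    if tokens and not decoded.startswith(' ') and tokens[0] in prefix_space_tokens:
        return ' ' + decoded
    return decoded

print(maybe_add_prefix_space([42, 3], 'world'))  # ' world'
print(maybe_add_prefix_space([3, 42], 'foo'))    # 'foo'
```
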
diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py
index de31a5daa7..8091dd29b4 100644
--- a/lmdeploy/turbomind/chat.py
+++ b/lmdeploy/turbomind/chat.py
@@ -4,12 +4,6 @@
import os.path as osp
import random
-import fire
-
-from lmdeploy import turbomind as tm
-from lmdeploy.model import MODELS
-from lmdeploy.tokenizer import Tokenizer
-
os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -73,9 +67,9 @@ def get_gen_param(cap,
def main(model_path,
session_id: int = 1,
cap: str = 'chat',
- sys_instruct: str = None,
- tp=1,
- stream_output=True,
+ tp: int = 1,
+ stream_output: bool = True,
+ request_output_len: int = 512,
**kwargs):
"""An example to perform model inference through the command line
interface.
@@ -85,24 +79,27 @@ def main(model_path,
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infilling', 'chat', 'python']
- sys_instruct (str): the content of 'system' role, which is used by
- conversational model
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
**kwarg (dict): other arguments for initializing model's chat template
"""
+ from lmdeploy import turbomind as tm
+ from lmdeploy.tokenizer import Tokenizer
+
tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
- tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
+ tm_model = tm.TurboMind(model_path,
+ eos_id=tokenizer.eos_token_id,
+ tp=tp,
+ capability=cap,
+ **kwargs)
generator = tm_model.create_instance()
nth_round = 1
step = 0
seed = random.getrandbits(64)
model_name = tm_model.model_name
- model = MODELS.get(model_name)(capability=cap, **kwargs) \
- if sys_instruct is None else MODELS.get(model_name)(
- capability=cap, system=sys_instruct, **kwargs)
+ model = tm_model.model
print(f'session {session_id}')
while True:
@@ -112,12 +109,13 @@ def main(model_path,
elif prompt == 'end':
prompt = model.get_prompt('', nth_round == 1)
input_ids = tokenizer.encode(prompt)
- for outputs in generator.stream_infer(session_id=session_id,
- input_ids=[input_ids],
- request_output_len=512,
- sequence_start=False,
- sequence_end=True,
- stream_output=stream_output):
+ for outputs in generator.stream_infer(
+ session_id=session_id,
+ input_ids=[input_ids],
+ request_output_len=request_output_len,
+ sequence_start=False,
+ sequence_end=True,
+ stream_output=stream_output):
pass
nth_round = 1
step = 0
@@ -125,13 +123,14 @@ def main(model_path,
else:
prompt = model.get_prompt(prompt, nth_round == 1)
input_ids = tokenizer.encode(prompt)
- if step + len(input_ids) >= tm_model.session_len:
+ if step + len(
+ input_ids) + request_output_len >= tm_model.session_len:
print('WARNING: exceed session max length.'
' Please end the session.')
continue
gen_param = get_gen_param(cap, model.sampling_param, nth_round,
- step, **kwargs)
+ step, request_output_len, **kwargs)
print(f'{prompt} ', end='', flush=True)
response_size = 0
@@ -145,6 +144,11 @@ def main(model_path,
res, tokens = outputs[0]
# decode res
response = tokenizer.decode(res.tolist(), offset=response_size)
+ # a '�' at the end means a potentially unfinished utf-8
+ # byte sequence; concatenate it with the next chunk and
+ # decode them together
+ if response.endswith('�'):
+ continue
response = valid_str(response)
print(f'{response}', end='', flush=True)
response_size = tokens
@@ -157,4 +161,6 @@ def main(model_path,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
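
The chat CLI now rejects a prompt when the accumulated context plus the requested completion would overflow the model's session window, rather than checking the prompt length alone. The guard as a tiny standalone sketch; the `session_len` value is illustrative:

```python
def fits_in_session(step, prompt_len, request_output_len, session_len=2056):
    """True if the next turn still fits in the session window."""
    return step + prompt_len + request_output_len < session_len

print(fits_in_session(step=1500, prompt_len=100, request_output_len=512))  # False
print(fits_in_session(step=100, prompt_len=100, request_output_len=512))   # True
```
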
diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py
index daef35298c..5ba4675c59 100644
--- a/lmdeploy/turbomind/decode.py
+++ b/lmdeploy/turbomind/decode.py
@@ -2,7 +2,6 @@
import os
import os.path as osp
-import fire
import torch
from lmdeploy import turbomind as tm
@@ -37,4 +36,6 @@ def main(model_path, inputs):
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/deploy/__init__.py b/lmdeploy/turbomind/deploy/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py
new file mode 100644
index 0000000000..4876002020
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/converter.py
@@ -0,0 +1,249 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import re
+import shutil
+from pathlib import Path
+
+import fire
+
+from lmdeploy.model import MODELS
+
+from .source_model.base import INPUT_MODELS
+from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
+
+supported_formats = ['llama', 'hf', 'awq', None]
+special_input_model_map = {
+ 'qwen': 'qwen',
+ 'baichuan': 'baichuan',
+ 'baichuan2': 'baichuan2'
+}
+
+
+def get_package_root_path():
+ """Get lmdeploy root path."""
+ import lmdeploy
+ return Path(lmdeploy.__file__).parent
+
+
+def get_tokenizer_path(model_path: str, tokenizer_path: str):
+ """Get tokenizer path if not given."""
+ if tokenizer_path is not None:
+ assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exist.'
+ return tokenizer_path
+ candidate = ['tokenizer.model', 'qwen.tiktoken']
+ for name in candidate:
+ tmp_path = osp.join(model_path, name)
+ if osp.exists(tmp_path):
+ tokenizer_path = tmp_path
+ break
+ assert tokenizer_path, 'please supply the tokenizer path via --tokenizer-path'
+ return tokenizer_path
+
+
+def get_model_format(model_name: str, model_format: str):
+ """Get model format if not given or equal awq."""
+ # get model name prefix
+ if model_name.find('-') != -1:
+ model_name = model_name[:model_name.find('-')]
+ # rules:
+ # 1) llama -> match special -> hf (if not matched)
+ # 2) append awq (if model_format is awq)
+ inferred_model_format = model_format
+ if model_format in [None, 'hf']:
+ inferred_model_format = special_input_model_map.get(model_name, 'hf')
+ elif model_format == 'awq':
+ inferred_model_format = special_input_model_map.get(model_name,
+ 'hf') + '-awq'
+ return inferred_model_format
+
+
+def create_workspace(_path: str):
+ """Create a workspace.
+
+ Args:
+ _path (str): the path of the workspace
+ """
+ if osp.exists(_path):
+ print(f'remove workspace in directory {_path}')
+ shutil.rmtree(_path)
+ print(f'create workspace in directory {_path}')
+ os.makedirs(_path)
+
+
+def copy_triton_model_templates(_path: str):
+ """copy triton model templates to the specified path.
+
+ Args:
+ _path (str): the target path
+ Returns:
+ str: the path of the triton models
+ """
+
+ root = get_package_root_path()
+ dir_path = osp.join(root, 'serve', 'turbomind')
+ triton_models_path = osp.join(dir_path, 'triton_models')
+ dst_path = osp.join(_path, 'triton_models')
+ print(f'copy triton model templates from "{triton_models_path}" to '
+ f'"{dst_path}"')
+ shutil.copytree(triton_models_path, dst_path, symlinks=True)
+ service_docker_up_file = osp.join(dir_path, 'service_docker_up.sh')
+ print(f'copy service_docker_up.sh from "{service_docker_up_file}" to '
+ f'"{_path}"')
+ shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
+ return dst_path
+
+
+def copy_tokenizer(model_path: str, tokenizer_path: str,
+ triton_models_path: str):
+ """Copy tokenizer."""
+ shutil.copy(
+ tokenizer_path,
+ osp.join(triton_models_path,
+ osp.join('tokenizer', osp.basename(tokenizer_path))))
+ for _file in os.listdir(model_path):
+ if _file.endswith('.json') or _file.endswith('.py'):
+ json_path = osp.join(model_path, _file)
+ shutil.copy(json_path,
+ osp.join(triton_models_path, 'tokenizer', _file))
+ with get_package_root_path() as root_path:
+ shutil.copy(osp.join(root_path, 'tokenizer.py'),
+ osp.join(triton_models_path, 'tokenizer'))
+
+
+def pack_model_repository(workspace_path: str):
+ """package the model repository.
+
+ Args:
+ workspace_path: the path of workspace
+ """
+ os.symlink(src=osp.join('..', '..', 'tokenizer'),
+ dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
+ '1', 'tokenizer'))
+ os.symlink(src=osp.join('..', '..', 'tokenizer'),
+ dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
+ '1', 'tokenizer'))
+ os.symlink(src=osp.join('..', '..', 'weights'),
+ dst=osp.join(workspace_path, 'triton_models', 'interactive',
+ '1', 'weights'))
+ model_repo_dir = osp.join(workspace_path, 'model_repository')
+ os.makedirs(model_repo_dir, exist_ok=True)
+ os.symlink(src=osp.join('..', 'triton_models', 'interactive'),
+ dst=osp.join(model_repo_dir, 'turbomind'))
+ os.symlink(src=osp.join('..', 'triton_models', 'preprocessing'),
+ dst=osp.join(model_repo_dir, 'preprocessing'))
+ os.symlink(src=osp.join('..', 'triton_models', 'postprocessing'),
+ dst=osp.join(model_repo_dir, 'postprocessing'))
+
+
+def main(model_name: str,
+ model_path: str,
+ model_format: str = None,
+ tokenizer_path: str = None,
+ dst_path: str = 'workspace',
+ tp: int = 1,
+ quant_path: str = None,
+ group_size: int = 0):
+ """deploy llama family models via turbomind.
+
+ Args:
+ model_name (str): the name of the to-be-deployed model, such as
+ llama-7b, llama-13b, vicuna-7b and etc
+ model_path (str): the directory path of the model
+ model_format (str): the format of the model, should choose from
+ ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
+ format, 'hf' means huggingface llama format, and 'awq' means
+ llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
+ the default value is None, which means the model_format will be
+ inferred based on model_name
+ tokenizer_path (str): the path of tokenizer model
+ dst_path (str): the destination path that saves outputs
+ tp (int): the number of GPUs used for tensor parallelism, should be 2^n
+ quant_path (str): Path of the quantized model, which can be None.
+ group_size (int): a parameter used in AWQ to quantize fp16 weights
+ to 4 bits
+ """
+
+ assert model_name in MODELS.module_dict.keys(), \
+ f"'{model_name}' is not supported. " \
+ f'The supported models are: {MODELS.module_dict.keys()}'
+
+ assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
+
+ output_format = 'fp16'
+
+ # get input model format
+ assert model_format in supported_formats, 'the model format ' \
+ f'should be in {supported_formats}'
+
+ inferred_model_format = get_model_format(model_name, model_format)
+ if inferred_model_format not in INPUT_MODELS.module_dict.keys():
+ supported_keys = list(INPUT_MODELS.module_dict.keys())
+ print(f'with model name {model_name} and model format {model_format}, '
+ f'the inferred model format is {inferred_model_format}, '
+ f'which is not in the supported list {supported_keys}')
+ exit(-1)
+
+ # get tokenizer path
+ tokenizer_path = get_tokenizer_path(model_path, tokenizer_path)
+
+ # create workspace
+ create_workspace(dst_path)
+
+ triton_models_path = copy_triton_model_templates(dst_path)
+
+ copy_tokenizer(model_path, tokenizer_path, triton_models_path)
+
+ # turbomind config
+ cfg = TurbomindModelConfig.from_dict({}, allow_none=True)
+ cfg.model_name = model_name
+ cfg.tensor_para_size = tp
+ cfg.rotary_embedding = cfg.size_per_head
+ cfg.group_size = group_size
+ if inferred_model_format.find('awq') != -1:
+ cfg.weight_type = 'int4'
+ output_format = 'w4'
+ assert group_size > 0, 'group_size should be > 0'
+
+ # convert
+ print('model_name ', model_name)
+ print('model_format ', model_format)
+ print('inferred_model_format ', inferred_model_format)
+ print('model_path ', model_path)
+ print('tokenizer_path ', tokenizer_path)
+ print('output_format ', output_format)
+ weight_path = osp.join(triton_models_path, 'weights')
+ input_model = INPUT_MODELS.get(inferred_model_format)(
+ model_path=model_path,
+ tokenizer_path=tokenizer_path,
+ ckpt_path=quant_path)
+ output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model,
+ cfg=cfg,
+ to_file=True,
+ out_dir=weight_path)
+ output_model.export()
+
+ # update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
+ with open(osp.join(triton_models_path, 'interactive', 'config.pbtxt'),
+ 'a') as f:
+ param = \
+ 'parameters {\n key: "tensor_para_size"\n value: {\n ' \
+ 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \
+ 'parameters {\n key: "model_name"\n value: {\n ' \
+ 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n'
+ f.write(param)
+
+ # pack model repository for triton inference server
+ pack_model_repository(dst_path)
+
+ # update the value of $TP in `service_docker_up.sh`
+ file_path = osp.join(dst_path, 'service_docker_up.sh')
+ with open(file_path, 'r') as f:
+ content = f.read()
+ content = re.sub('TP=1', f'TP={tp}', content)
+ with open(file_path, 'w') as f:
+ f.write(content)
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
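
The converter resolves the input-model class from the model-name prefix and the requested format; `get_model_format` reduces to a lookup in `special_input_model_map` plus an optional `-awq` suffix. A few example resolutions, assuming the function imports cleanly from an installed checkout and behaves exactly as written above:

```python
from lmdeploy.turbomind.deploy.converter import get_model_format

assert get_model_format('internlm-chat-7b', None) == 'hf'
assert get_model_format('qwen-7b', None) == 'qwen'
assert get_model_format('llama2-7b', 'awq') == 'hf-awq'
assert get_model_format('baichuan2-7b', 'awq') == 'baichuan2-awq'
```
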
diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py
new file mode 100644
index 0000000000..7c6627c770
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
+from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
+from .llama import LlamaModel # noqa: F401
+from .llama_awq import LlamaAwqModel # noqa: F401
+from .meta_llama import MetaLlamaModel # noqa: F401
+from .qwen import QwenModel # noqa: F401
+from .qwen_awq import QwenAwqModel # noqa: F401
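
Each source-model class registers itself under a format name, so the converter can look the class up through the `INPUT_MODELS` registry instead of an if/elif chain. A sketch of the lookup, mirroring how `converter.py` uses it; the paths are placeholders:

```python
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS

# 'baichuan2' is one of the names registered above; the paths are placeholders.
input_model = INPUT_MODELS.get('baichuan2')(
    model_path='/path/to/Baichuan2-7B-Chat',
    tokenizer_path='/path/to/Baichuan2-7B-Chat/tokenizer.model',
    ckpt_path=None)
print(input_model.model_info())
```
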
diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py
new file mode 100644
index 0000000000..46ccb6309d
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/baichuan.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+class BaichuanReader(LlamaReader):
+ """BaichuanReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
+ """Get q, k, v, o kind for layer i."""
+ result = []
+ pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}'
+ qkv = self.params[pack_key]
+ result.extend(torch.split(qkv, qkv.shape[size_dim] // 3, dim=dim))
+ o = self.params[f'model.layers.{i}.self_attn.o_proj.{kind}']
+ result.append(o)
+ return (*result, )
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ return self._attn(i, 'weight', 0, 0)
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return (None, ) * 4
+
+
+class Baichuan2Reader(BaichuanReader):
+ """Baichuan2Reader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def output_weight(self):
+ """Get output."""
+ # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
+ tensor = self.params.get('lm_head.weight', None)
+ if tensor is not None:
+ tensor = tensor.cuda()
+ tensor = torch.nn.functional.normalize(tensor)
+ return tensor
+
+
+@INPUT_MODELS.register_module(name='baichuan')
+class BaichuanModel(LlamaModel):
+ """Llama model in baichuan format."""
+
+ Reader = BaichuanReader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
+ super().__init__(model_path, tokenizer_path, **kwargs)
+
+
+@INPUT_MODELS.register_module(name='baichuan2')
+class Baichuan2Model(LlamaModel):
+ """Llama model in baichuan format."""
+
+ Reader = Baichuan2Reader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
+ super().__init__(model_path, tokenizer_path, **kwargs)
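
`BaichuanReader` recovers separate q, k and v projections from the fused `W_pack` tensor by splitting it into three equal chunks along the packed dimension. A toy-shaped sketch of that split; the sizes are made up for illustration:

```python
import torch

hidden = 8                                # toy hidden size
w_pack = torch.randn(3 * hidden, hidden)  # fused qkv as stored by Baichuan
q, k, v = torch.split(w_pack, w_pack.shape[0] // 3, dim=0)
assert q.shape == k.shape == v.shape == (hidden, hidden)
```
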
diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py
new file mode 100644
index 0000000000..d5d60286a8
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .baichuan import Baichuan2Model, BaichuanModel, BaichuanReader
+from .base import INPUT_MODELS
+from .llama_awq import ensure_fp16orint32
+
+
+class BaichuanAwqReader(BaichuanReader):
+ """BaichuanAwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def attn(self, i: int):
+ """Get q, k, v, o qweight for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o qzeros for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scales for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
+
+ def ffn(self, i: int):
+ """Get ffn qweight for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qweight'))
+
+ def ffn_zero(self, i: int):
+ """Get ffn qzeros for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qzeros'))
+
+ def ffn_scale(self, i: int):
+ """Get ffn scales for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'scales'))
+
+
+class Baichuan2AwqReader(BaichuanAwqReader):
+ """Baichuan2AwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def output_weight(self):
+ """Get output."""
+ # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
+ tensor = self.params.get('lm_head.weight', None)
+ if tensor is not None:
+ tensor = tensor.cuda()
+ tensor = torch.nn.functional.normalize(tensor)
+ return tensor
+
+
+@INPUT_MODELS.register_module(name='baichuan-awq')
+class BaichuanAwqModel(BaichuanModel):
+ """Baichuan awq model in hf format."""
+
+ Reader = BaichuanAwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
+
+
+@INPUT_MODELS.register_module(name='baichuan2-awq')
+class Baichuan2AwqModel(Baichuan2Model):
+ """Baichuan2 awq model in hf format."""
+
+ Reader = Baichuan2AwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py
new file mode 100644
index 0000000000..89f18033e9
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/base.py
@@ -0,0 +1,174 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import re
+from abc import ABC, abstractmethod
+from typing import Dict, Iterator, Tuple, Union
+
+import torch
+from mmengine import Registry
+
+INPUT_MODELS = Registry(
+ 'source model', locations=['lmdeploy.turbomind.deploy.source_model.base'])
+
+
+class BaseReader(ABC):
+ """Base checkpoint manager."""
+
+ def __init__(self):
+ pass
+
+ @property
+ @abstractmethod
+ def start_layer_id(self) -> int:
+ """Get the start transformer layer number."""
+ pass
+
+ @property
+ @abstractmethod
+ def end_layer_id(self) -> int:
+ """Get the end transformer layer number."""
+ pass
+
+ @abstractmethod
+ def init_layer_id(self) -> None:
+ """Get start and end transformer layer number."""
+ self._start_layer_id = -1
+ self._end_layer_id = -1
+ layer_count = {}
+ for key in self.params:
+ layer_id = re.findall(self.attn_layer_patten, key)
+ if len(layer_id) == 0:
+ continue
+ layer_id = int(layer_id[0])
+ if layer_id not in layer_count:
+ layer_count[layer_id] = 0
+ layer_count[layer_id] += 1
+ if len(layer_count) == 0:
+ return
+ if not (len(layer_count) > 1 or self.last_bin):
+ return
+ max_count = max([layer_count[layer_id] for layer_id in layer_count])
+ valid_layer_id = [
+ layer_id for layer_id in layer_count
+ if layer_count[layer_id] == max_count
+ ]
+ self._start_layer_id = min(valid_layer_id)
+ self._end_layer_id = max(valid_layer_id) + 1
+
+ @abstractmethod
+ def clean_up(self, last: bool) -> None:
+ """Clean up unused params."""
+ if last:
+ self.params.clear()
+ else:
+ to_remove = []
+ for key in self.params:
+ layer_id = re.findall(self.attn_layer_patten, key)
+ if len(layer_id) == 0:
+ to_remove.append(key)
+ else:
+ layer_id = int(layer_id[0])
+ if layer_id < self.end_layer_id:
+ to_remove.append(key)
+ for key in to_remove:
+ self.params.pop(key, None)
+ torch.cuda.empty_cache()
+
+ @abstractmethod
+ def tok_embeddings(self) -> Union[torch.Tensor, None]:
+ """Get embeddings."""
+ pass
+
+ @abstractmethod
+ def norm_weight(self) -> Union[torch.Tensor, None]:
+ """Get norm."""
+ pass
+
+ @abstractmethod
+ def output_weight(self) -> Union[torch.Tensor, None]:
+ """Get output."""
+ pass
+
+ @abstractmethod
+ def attn(self, i: int) -> Tuple[torch.Tensor]:
+ """Get q, k, v, o weight for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_bias(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get q, k, v, o bias for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get q, k, v, o zero point for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get q, k, v, o scale for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_norm(self, i: int) -> torch.Tensor:
+ """Get attn norm for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn(self, i: int) -> Tuple[torch.Tensor]:
+ """Get ffn weight for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get ffn zero point for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get ffn scale for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn_norm(self, i: int) -> torch.Tensor:
+ """Get ffn norm for layer i."""
+ pass
+
+
+class BaseInputModel(ABC):
+ """Base class for input model."""
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+ """Constructor for BaseInputModel.
+
+ Args:
+ model_path (str): the path of the model.
+ tokenizer_path (str): the path of the tokenizer model.
+ """
+ self.model_path = model_path
+ self.tokenizer_path = tokenizer_path
+
+ @property
+ @abstractmethod
+ def nmgrs(self) -> int:
+ """Get number of checkpoint."""
+ pass
+
+ @abstractmethod
+ def get_mgrs(self) -> Iterator[BaseReader]:
+ """Conctruct all BaseReader."""
+ pass
+
+ @abstractmethod
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ pass
+
+ @abstractmethod
+ def model_info(self) -> Dict:
+ """Read model info."""
+ pass
+
+ def bins(self) -> Iterator[BaseReader]:
+ """Get Reader."""
+ for mgr in self.get_mgrs():
+ yield mgr
diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py
new file mode 100644
index 0000000000..f800260467
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/llama.py
@@ -0,0 +1,198 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+
+import torch
+from safetensors.torch import load_file
+
+from lmdeploy.tokenizer import Tokenizer
+
+from .base import INPUT_MODELS, BaseInputModel, BaseReader
+
+
+class LlamaReader(BaseReader):
+ """LlamaReader."""
+
+ attn_layer_patten = r'model.layers.([0-9]+).'
+ tok_embeddings_key = 'model.embed_tokens.weight'
+ norm_weight_key = 'model.norm.weight'
+ output_weight_key = 'lm_head.weight'
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__()
+ self.params = unused_params
+ self.params.update(new_params)
+ self.last_bin = last_bin
+ self.init_layer_id()
+
+ def init_layer_id(self):
+ """Get start/end transformer layer id."""
+ super().init_layer_id()
+
+ def clean_up(self, last: bool) -> None:
+ """Clean up unused params."""
+ super().clean_up(last)
+
+ @property
+ def start_layer_id(self):
+ """Get start transformer layer id."""
+ return self._start_layer_id
+
+ @property
+ def end_layer_id(self):
+ """Get end transformer layer id."""
+ return self._end_layer_id
+
+ def tok_embeddings(self):
+ """Get embeddings."""
+ return self.params.get(self.tok_embeddings_key, None)
+
+ def norm_weight(self):
+ """Get norm."""
+ return self.params.get(self.norm_weight_key, None)
+
+ def output_weight(self):
+ """Get output."""
+ return self.params.get(self.output_weight_key, None)
+
+ def _attn(self, i: int, kind: str, allow_none=False):
+ """Get q, k, v, o kind for layer i."""
+ result = []
+ for key in ['q', 'k', 'v', 'o']:
+ tensor = self.params.get(
+ f'model.layers.{i}.self_attn.{key}_proj.{kind}')
+ if not allow_none:
+ assert tensor is not None
+ result.append(tensor)
+ return (*result, )
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ return self._attn(i, 'weight')
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return self._attn(i, 'bias', allow_none=True)
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o zero point for layer i."""
+ return (None, ) * 4
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scale for layer i."""
+ return (None, ) * 4
+
+ def attn_norm(self, i: int):
+ """Get attn norm for layer i."""
+ return self.params[f'model.layers.{i}.input_layernorm.weight']
+
+ def _ffn(self, i: int, kind: str):
+ """Get ffn kind for layer i."""
+ result = []
+ for key in ['gate', 'down', 'up']:
+ tensor = self.params[f'model.layers.{i}.mlp.{key}_proj.{kind}']
+ result.append(tensor)
+ return (*result, )
+
+ def ffn(self, i: int):
+ """Get ffn weight for layer i."""
+ return self._ffn(i, 'weight')
+
+ def ffn_zero(self, i: int):
+ """Get ffn zero point for layer i."""
+ return (None, ) * 3
+
+ def ffn_scale(self, i: int):
+ """Get ffn scale for layer i."""
+ return (None, ) * 3
+
+ def ffn_norm(self, i: int):
+ """Get ffn norm for layer i."""
+ return self.params[f'model.layers.{i}.post_attention_layernorm.weight']
+
+
+@INPUT_MODELS.register_module(name='hf')
+class LlamaModel(BaseInputModel):
+ """Llama model in hf format."""
+
+ Reader = LlamaReader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
+ super().__init__(model_path, tokenizer_path)
+ ckpt_path = kwargs.get('ckpt_path')
+ if ckpt_path is None:
+ ckpt_path = model_path
+ self.ckpt_path = ckpt_path
+ self.ckpt_files = self.get_ckpt()
+
+ def get_ckpt(self):
+ """Get weight files."""
+ suffixes = ['.safetensors', '.bin']
+ files = []
+ for suffix in suffixes:
+ files = [
+ file for file in os.listdir(self.ckpt_path)
+ if file.endswith(suffix)
+ ]
+ if len(files) > 0:
+ break
+ files = sorted(files)
+ return files
+
+ @property
+ def nmgrs(self):
+ """Get number of checkpoint."""
+ return len(self.ckpt_files)
+
+ def get_mgrs(self):
+ """Conctruct all Reader."""
+ assert self.nmgrs > 0, \
+ f'could not find checkpoints in {self.ckpt_path}'
+ unused_params = {}
+ try:
+ for i, ckpt in enumerate(self.ckpt_files):
+ is_last_bin = i == len(self.ckpt_files) - 1
+ if ckpt.endswith('.bin'):
+ new_params = torch.load(osp.join(self.ckpt_path, ckpt),
+ map_location='cpu')
+ else:
+ new_params = load_file(osp.join(self.ckpt_path, ckpt))
+ ret = self.Reader(new_params, unused_params,
+ i == self.nmgrs - 1)
+ yield ret
+ ret.clean_up(is_last_bin)
+ except GeneratorExit:
+ ret.clean_up(True)
+
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ assert osp.isdir(self.model_path), self.model_path
+ tk_model = Tokenizer(self.model_path)
+ n_words = tk_model.vocab_size
+ bos_id = tk_model.bos_token_id
+ eos_id = tk_model.eos_token_id
+ return n_words, bos_id, eos_id
+
+ def model_info(self):
+ """Read model info."""
+ params_path = osp.join(self.model_path, 'config.json')
+ with open(params_path) as f:
+ model_arg = json.load(f)
+ num_layer = model_arg['num_hidden_layers']
+ norm_eps = model_arg['rms_norm_eps']
+ if 'num_key_value_heads' in model_arg:
+ kv_head_num = model_arg['num_key_value_heads']
+ else:
+ kv_head_num = model_arg['num_attention_heads']
+ rope_theta = float(model_arg.get('rope_theta', 10000.0))
+ max_position_embeddings = int(
+ model_arg.get('max_position_embeddings', 0))
+ rope_scaling = bool(model_arg.get('rope_scaling', False))
+
+ return dict(num_layer=num_layer,
+ norm_eps=norm_eps,
+ kv_head_num=kv_head_num,
+ rope_theta=rope_theta,
+ max_position_embeddings=max_position_embeddings,
+ use_dynamic_ntk=int(rope_scaling))
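
A usage sketch for the reader API above (illustrative only: the checkpoint path is a placeholder, lmdeploy and its dependencies must be importable, and `'hf'` is the name registered by the decorator on `LlamaModel`):

```python
from lmdeploy.turbomind.deploy.source_model import llama  # noqa: F401, registers 'hf'
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS

# Placeholder path: point this at a local HF-format checkpoint directory.
input_model = INPUT_MODELS.get('hf')(model_path='/path/to/internlm-chat-7b',
                                     tokenizer_path='/path/to/internlm-chat-7b')
print(input_model.model_info())        # num_layer, norm_eps, kv_head_num, ...
for reader in input_model.bins():      # one LlamaReader per weight shard
    for i in range(reader.start_layer_id, reader.end_layer_id):
        qw, kw, vw, ow = reader.attn(i)  # q/k/v/o projection weights of layer i
```
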
diff --git a/lmdeploy/turbomind/deploy/source_model/llama_awq.py b/lmdeploy/turbomind/deploy/source_model/llama_awq.py
new file mode 100644
index 0000000000..9d2ae8ac50
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/llama_awq.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+def ensure_fp16orint32(tensors: torch.Tensor):
+ """Ensure tensors in fp16/int32 format."""
+ result = []
+ for tensor in tensors:
+ if tensor is not None:
+ if tensor.dtype in [torch.float16, torch.float32, torch.bfloat16]:
+ result.append(tensor.half())
+ else:
+ assert tensor.dtype == torch.int32
+ result.append(tensor)
+ else:
+ result.append(None)
+ return (*result, )
+
+
+class LlamaAwqReader(LlamaReader):
+ """LlamaAwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def attn(self, i: int):
+ """Get q, k, v, o qweight for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qweight'))
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o qzeros for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qzeros'))
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scales for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'scales'))
+
+ def ffn(self, i: int):
+ """Get ffn qweight for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qweight'))
+
+ def ffn_zero(self, i: int):
+ """Get ffn qzeros for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qzeros'))
+
+ def ffn_scale(self, i: int):
+ """Get ffn scales for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'scales'))
+
+
+@INPUT_MODELS.register_module(name='hf-awq')
+class LlamaAwqModel(LlamaModel):
+ """Llama Awq model in hf format."""
+
+ Reader = LlamaAwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
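
A small illustration of `ensure_fp16orint32` (a sketch; it assumes lmdeploy is importable): floating-point tensors are cast to fp16, int32 tensors such as AWQ-packed `qweight`/`qzeros` pass through unchanged, and `None` entries stay `None`.

```python
import torch

from lmdeploy.turbomind.deploy.source_model.llama_awq import ensure_fp16orint32

out = ensure_fp16orint32((torch.ones(2, 2, dtype=torch.float32),
                          torch.zeros(2, 2, dtype=torch.int32),
                          None))
print([None if t is None else t.dtype for t in out])
# [torch.float16, torch.int32, None]
```
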
diff --git a/lmdeploy/turbomind/deploy/source_model/meta_llama.py b/lmdeploy/turbomind/deploy/source_model/meta_llama.py
new file mode 100644
index 0000000000..bc26361c73
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/meta_llama.py
@@ -0,0 +1,224 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import torch
+from sentencepiece import SentencePieceProcessor
+
+from .base import INPUT_MODELS, BaseInputModel, BaseReader
+
+
+def reverse_permute(x: torch.Tensor, size_per_head: int = 128):
+ """reverse permute to hf format."""
+ if x.shape[-1] > 1:
+ dim = x.shape[-1]
+ n_heads = dim // size_per_head
+ return x.view(-1, n_heads, dim // n_heads // 2,
+ 2).transpose(2, 3).reshape(-1, dim)
+ else: # scales, zeros
+ dim = x.shape[0]
+ n_heads = dim // size_per_head
+ return x.view(n_heads, dim // n_heads // 2, 2,
+ 1).transpose(1, 2).reshape(dim, 1)
+
+
+class MetaLlamaReader(BaseReader):
+ """MetaLlamaReader."""
+
+ def __init__(self, model_path: str, start_layer_id: int,
+ end_layer_id: int):
+ super().__init__()
+ self._start_layer_id = start_layer_id
+ self._end_layer_id = end_layer_id
+ self.params = self.load_model(model_path)
+
+ def init_layer_id(self):
+ """Empty."""
+ pass
+
+ def load_model(self, model_path):
+ """Load all parameters."""
+ checkpoints = []
+ for pattern in ['*.pth', '*.pt']:
+ checkpoints += sorted(Path(model_path).glob(pattern))
+ n_ckpt = len(checkpoints)
+ model_params = {}
+
+ def get_param(_name, _size):
+ if _name not in model_params:
+ model_params[_name] = torch.zeros(_size,
+ dtype=torch.float16,
+ device='cpu')
+ return model_params[_name]
+
+ from tqdm import tqdm
+ pbar = tqdm(total=n_ckpt, desc='load meta ckpt', leave=False)
+ for i, ckpt_path in enumerate(checkpoints):
+ ckpt = torch.load(ckpt_path, map_location='cpu')
+ for param_name, param_data in ckpt.items():
+ key, ext = param_name.split('.')[-2:]
+ # column-parallel
+ if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
+ size = param_data.size(0)
+ if ext == 'weight':
+ param = get_param(
+ param_name,
+ [size * n_ckpt, param_data.size(1)])
+ param.data[size * i:size * (i + 1), :] = param_data
+ else: # bias
+ param = get_param(param_name, [size * n_ckpt])
+ param.data[size * i:size * (i + 1)] = param_data
+ # row-parallel
+ elif key in ['w2', 'wo', 'tok_embeddings']:
+ size = param_data.size(-1)
+ if ext == 'weight':
+ param = get_param(param_name,
+ [param_data.size(0), size * n_ckpt])
+ param.data[:, size * i:size * (i + 1)] = param_data
+ else: # bias
+ param = get_param(param_name, [size])
+ param.data = param_data
+ elif i == 0:
+ param = get_param(param_name, param_data.size())
+ param.data = param_data
+ del ckpt
+ pbar.update(1)
+ pbar.close()
+
+ for name, param in model_params.items():
+ # transpose all weights as TurboMind is expecting column-major
+ # (output_dims, input_dims) -> (input_dims, output_dims)
+ key = name.split('.')[-2]
+ if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
+ param.data = param.data.t()
+ if key in ['wq', 'wk']:
+ param.data = reverse_permute(param.data)
+ return model_params
+
+ def clean_up(self, last: bool) -> None:
+ """Clean up unused params."""
+ self.params.clear()
+
+ @property
+ def start_layer_id(self):
+ """Get start transformer layer id."""
+ return self._start_layer_id
+
+ @property
+ def end_layer_id(self):
+ """Get end transformer layer id."""
+ return self._end_layer_id
+
+ def tok_embeddings(self):
+ """Get embeddings."""
+ return self.params.get('tok_embeddings.weight')
+
+ def norm_weight(self):
+ """Get norm."""
+ return self.params.get('norm.weight')
+
+ def output_weight(self):
+ """Get output."""
+ return self.params.get('output.weight')
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ result = []
+ for key in ['wq', 'wk', 'wv', 'wo']:
+ tensor = self.params[f'layers.{i}.attention.{key}.weight']
+ tensor = tensor.t() if tensor is not None else None
+ result.append(tensor)
+ return (*result, )
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ result = []
+ for key in ['wq', 'wk', 'wv', 'wo']:
+ tensor = self.params.get(f'layers.{i}.attention.{key}.bias')
+ tensor = tensor.t() if tensor is not None else None
+ result.append(tensor)
+ return (*result, )
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o zero point for layer i."""
+ return (None, ) * 4
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scale for layer i."""
+ return (None, ) * 4
+
+ def attn_norm(self, i: int):
+ """Get attn norm for layer i."""
+ return self.params[f'layers.{i}.attention_norm.weight']
+
+ def ffn(self, i: int):
+ """Get ffn weight for layer i."""
+ result = []
+ for key in ['w1', 'w2', 'w3']:
+ tensor = self.params[f'layers.{i}.feed_forward.{key}.weight']
+ result.append(tensor.t())
+ return (*result, )
+
+ def ffn_zero(self, i: int):
+ """Get ffn zero point for layer i."""
+ return (None, ) * 3
+
+ def ffn_scale(self, i: int):
+ """Get ffn scale for layer i."""
+ return (None, ) * 3
+
+ def ffn_norm(self, i: int):
+ """Get ffn norm for layer i."""
+ return self.params[f'layers.{i}.ffn_norm.weight']
+
+
+@INPUT_MODELS.register_module(name='llama')
+class MetaLlamaModel(BaseInputModel):
+ """Llama model in fb format."""
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+ super().__init__(model_path, tokenizer_path, **kwargs)
+
+ @property
+ def nmgrs(self):
+ """Get number of checkpoint."""
+ return 1
+
+ def get_mgrs(self):
+ """Conctruct all BaseReader."""
+ end_layer_id = self.model_info()['num_layer']
+ try:
+ if hasattr(self, 'meta_reader'):
+ yield self.meta_reader
+ else:
+ self.meta_reader = MetaLlamaReader(self.model_path, 0,
+ end_layer_id)
+ yield self.meta_reader
+ except GeneratorExit:
+ pass
+
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ assert osp.isfile(self.tokenizer_path), self.tokenizer_path
+ sp_model = SentencePieceProcessor(model_file=self.tokenizer_path)
+ # BOS / EOS token IDs
+ n_words = sp_model.vocab_size()
+ bos_id = sp_model.bos_id()
+ eos_id = sp_model.eos_id()
+ return n_words, bos_id, eos_id
+
+ def model_info(self):
+ """Read model info."""
+ params_path = osp.join(self.model_path, 'params.json')
+ with open(params_path) as f:
+ model_arg = json.load(f)
+ num_layer = model_arg['n_layers']
+ norm_eps = model_arg['norm_eps']
+ head_num = model_arg.get('n_heads', 32)
+ kv_head_num = model_arg.get('n_kv_heads', head_num)
+
+ return dict(num_layer=num_layer,
+ norm_eps=norm_eps,
+ head_num=head_num,
+ kv_head_num=kv_head_num)
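
`reverse_permute` is, mathematically, the inverse of the `permute` helper added in `target_model/base.py` below; a quick self-check (illustrative; importing `target_model` assumes the compiled turbomind extension is available, since the package `__init__` pulls it in):

```python
import torch

from lmdeploy.turbomind.deploy.source_model.meta_llama import reverse_permute
from lmdeploy.turbomind.deploy.target_model.base import permute

w = torch.randn(4096, 4096)  # any 2-D tensor whose last dim is a multiple of 128
assert torch.equal(reverse_permute(permute(w)), w)
```
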
diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py
new file mode 100644
index 0000000000..09ff93afc5
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+
+import torch
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+class QwenReader(LlamaReader):
+ """QwenReader."""
+
+ attn_layer_patten = r'transformer.h.([0-9]+).'
+ tok_embeddings_key = 'transformer.wte.weight'
+ norm_weight_key = 'transformer.ln_f.weight'
+ output_weight_key = 'lm_head.weight'
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
+ """Get q, k, v, o kind for layer i."""
+ qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}']
+ q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim)
+ o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None)
+ if o is None:
+ o = torch.zeros_like(q)
+ return q, k, v, o
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ return self._attn(i, 'weight', 0, 0)
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return self._attn(i, 'bias', -1, 0)
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o zero point for layer i."""
+ return (None, ) * 4
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scale for layer i."""
+ return (None, ) * 4
+
+ def attn_norm(self, i: int):
+ """Get attn norm for layer i."""
+ return self.params[f'transformer.h.{i}.ln_1.weight']
+
+ def _ffn(self, i: int, kind: str):
+ """Get ffn kind for layer i."""
+ result = []
+ for key in ['w2', 'c_proj', 'w1']:
+ tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}']
+ result.append(tensor)
+ return (*result, )
+
+ def ffn(self, i: int):
+ """Get ffn weight for layer i."""
+ return self._ffn(i, 'weight')
+
+ def ffn_zero(self, i: int):
+ """Get ffn zero point for layer i."""
+ return (None, ) * 3
+
+ def ffn_scale(self, i: int):
+ """Get ffn scale for layer i."""
+ return (None, ) * 3
+
+ def ffn_norm(self, i: int):
+ """Get ffn norm for layer i."""
+ return self.params[f'transformer.h.{i}.ln_2.weight']
+
+
+@INPUT_MODELS.register_module(name='qwen')
+class QwenModel(LlamaModel):
+ """Qwen model in hf format."""
+
+ Reader = QwenReader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+ super().__init__(model_path, tokenizer_path, **kwargs)
+
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ n_words = 151851
+ bos_id = 0
+ eos_id = 151643
+ return n_words, bos_id, eos_id
+
+ def model_info(self):
+ """Read model info."""
+ params_path = osp.join(self.model_path, 'config.json')
+ with open(params_path) as f:
+ config = json.load(f)
+ num_layer = config['num_hidden_layers']
+ norm_eps = config['layer_norm_epsilon']
+ rope_theta = float(config.get('rotary_emb_base', 10000.0))
+ if 'num_key_value_heads' in config:
+ kv_head_num = config['num_key_value_heads']
+ else:
+ kv_head_num = config['num_attention_heads']
+ seq_length = config['seq_length']
+ use_dynamic_ntk = int(config['use_dynamic_ntk'])
+ use_logn_attn = int(config['use_logn_attn'])
+ return dict(num_layer=num_layer,
+ norm_eps=norm_eps,
+ kv_head_num=kv_head_num,
+ rope_theta=rope_theta,
+ max_position_embeddings=seq_length,
+ use_dynamic_ntk=int(use_dynamic_ntk),
+ use_logn_attn=use_logn_attn)
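
Qwen keeps q, k and v fused in a single `c_attn` projection, which `_attn` above splits into three equal chunks along the output dimension. A minimal shape sketch (the 4096 hidden size is an assumption):

```python
import torch

hidden = 4096                                    # assumed hidden size
c_attn_weight = torch.randn(3 * hidden, hidden)  # transformer.h.<i>.attn.c_attn.weight
q, k, v = torch.split(c_attn_weight, c_attn_weight.size(0) // 3, dim=0)
print(q.shape, k.shape, v.shape)                 # three (4096, 4096) chunks
```
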
diff --git a/lmdeploy/turbomind/deploy/source_model/qwen_awq.py b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py
new file mode 100644
index 0000000000..04df2ac729
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import INPUT_MODELS
+from .llama_awq import ensure_fp16orint32
+from .qwen import QwenModel, QwenReader
+
+
+class QwenAwqReader(QwenReader):
+ """QwenAwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def attn(self, i: int):
+ """Get q, k, v, o qweight for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'bias', -1, 0))
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o qzeros for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scales for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
+
+ def ffn(self, i: int):
+ """Get ffn qweight for layer i."""
+ # ours: w2(silu(w1(x)) * w3(x))
+ # qwen: c_proj(w1(x) * silu(w2(x)))
+ return ensure_fp16orint32(self._ffn(i, 'qweight'))
+
+ def ffn_zero(self, i: int):
+ """Get ffn qzeros for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qzeros'))
+
+ def ffn_scale(self, i: int):
+ """Get ffn scales for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'scales'))
+
+
+@INPUT_MODELS.register_module(name='qwen-awq')
+class QwenAwqModel(QwenModel):
+ """Qwen awq model in hf format."""
+
+ Reader = QwenAwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py
new file mode 100644
index 0000000000..fe03500e45
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .fp import TurbomindModel # noqa: F401
+from .w4 import TurbomindW4Model # noqa: F401
diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py
new file mode 100644
index 0000000000..5e9b5341f7
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/base.py
@@ -0,0 +1,236 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import configparser
+import inspect
+import os.path as osp
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import torch
+import tqdm
+from mmengine import Registry
+
+from lmdeploy.model import MODELS
+
+from ..source_model.base import BaseInputModel, BaseReader
+
+OUTPUT_MODELS = Registry(
+ 'target model', locations=['lmdeploy.turbomind.deploy.target_model.base'])
+
+
+def tprint(*args, **kwargs):
+ from io import StringIO
+ s = StringIO()
+ print(*args, **kwargs, file=s, end='')
+ tqdm.tqdm.write(s.getvalue())
+
+
+@dataclass
+class TurbomindModelConfig:
+ """Config for turbomind model."""
+ model_name: str
+ tensor_para_size: int
+ head_num: int
+ kv_head_num: int
+ vocab_size: int
+ num_layer: int
+ inter_size: int
+ norm_eps: float
+ attn_bias: int
+ start_id: int
+ end_id: int
+ session_len: int
+ weight_type: str = 'fp16'
+ rotary_embedding: int = 128
+ rope_theta: float = 10000.0
+ size_per_head: int = 128
+ group_size: int = 0
+ max_batch_size: int = 32
+ max_context_token_num: int = 4
+ step_length: int = 1
+ cache_max_entry_count: int = 48
+ cache_chunk_size: int = 1
+ use_context_fmha: int = 1
+ quant_policy: int = 0
+ max_position_embeddings: int = 0
+ use_dynamic_ntk: int = 0
+ use_logn_attn: int = 0
+
+ @classmethod
+ def from_dict(cls, env, allow_none=False):
+ """Construct from dict."""
+ params = inspect.signature(cls).parameters
+ used = {k: v for k, v in env.items() if k in params and v is not None}
+ if not allow_none:
+ return cls(**used)
+ else:
+ default = {
+ k: None
+ for k in params.keys() if params[k].default is inspect._empty
+ }
+ default.update(used)
+ return cls(**default)
+
+ @property
+ def valid(self):
+ """Check if cfg is valid."""
+ for _, v in self.__dict__.items():
+ if v is None:
+ return False
+ return True
+
+
+class BaseOutputModel(ABC):
+ """Base output model."""
+
+ def __init__(self,
+ input_model: BaseInputModel,
+ cfg: TurbomindModelConfig,
+ to_file: bool = True,
+ out_dir: str = ''):
+ super().__init__()
+ self.input_model = input_model
+ self.cfg = self.get_config(cfg)
+ assert self.cfg.valid
+ self.to_file = to_file
+ self.out_dir = out_dir
+
+ @abstractmethod
+ def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig:
+ """Generate turbomind model config (config.ini)."""
+ _, bos_id, eos_id = self.input_model.tokenizer_info()
+ model = MODELS.get(cfg.model_name)()
+ final_cfg = cfg.__dict__
+ final_cfg.update(
+ dict(start_id=bos_id,
+ end_id=eos_id,
+ session_len=model.session_len + 8))
+ final_cfg.update(self.input_model.model_info())
+
+ # head_num, vocab_size
+ for bin in self.input_model.bins():
+ emb = bin.tok_embeddings()
+ if emb is not None:
+ _vocab_size, dim = emb.shape
+ head_num = dim // cfg.size_per_head
+ break
+ final_cfg.update(dict(head_num=head_num, vocab_size=_vocab_size))
+ return TurbomindModelConfig.from_dict(final_cfg, allow_none=True)
+
+ def export_config(self) -> None:
+ """export turbomind config."""
+ if self.to_file:
+ config = configparser.ConfigParser()
+ cfg = dict(llama=self.cfg.__dict__)
+ for section, key_values in cfg.items():
+ config[section] = key_values
+ config_path = osp.join(self.out_dir, 'config.ini')
+ with open(config_path, 'w') as f:
+ config.write(f)
+
+ def export_weight(self, param: torch.Tensor, name: str) -> None:
+ """export turbomind weight."""
+ if self.to_file:
+ if param.dtype in [torch.float, torch.bfloat16]:
+ param = param.half()
+ tprint(name, param.shape)
+ param.contiguous().cpu().numpy().tofile(
+ osp.join(self.out_dir, name))
+
+ def save_split(self,
+ tensor: torch.Tensor,
+ name: str,
+ split_dim=None,
+ copy=False) -> None:
+ """save split."""
+ tp = self.cfg.tensor_para_size
+ if split_dim is not None:
+ tprint(f'*** splitting {name}, shape={tensor.shape}, '
+ f'split_dim={split_dim}, tp={tp}')
+ assert tensor.shape[split_dim] % tp == 0
+ split_size = tensor.shape[split_dim] // tp
+ splits = torch.split(tensor, split_size, dim=split_dim)
+ for i, split in enumerate(splits):
+ prefix, ext = osp.splitext(name)
+ self.export_weight(split, f'{prefix}.{i}{ext}')
+ elif copy:
+ tprint(f'### copying {name}, shape={tensor.shape}')
+ copies = [tensor] * tp
+ for i, copy in enumerate(copies):
+ prefix, ext = osp.splitext(name)
+ self.export_weight(copy, f'{prefix}.{i}{ext}')
+ else:
+ self.export_weight(tensor, name)
+
+ def export(self) -> None:
+ """Export to turbomind model format."""
+ num_layer = self.cfg.num_layer
+ from tqdm import tqdm
+ pbar = tqdm(total=num_layer, desc='Convert to turbomind format')
+ self.export_config()
+ for bin in self.input_model.bins():
+ self.export_misc(bin)
+ for i in range(bin.start_layer_id, bin.end_layer_id):
+ self.export_transformer_block(bin, i)
+ pbar.update(1)
+ pbar.close()
+ # manually clean up meta reader
+ if hasattr(self.input_model, 'meta_reader'):
+ self.input_model.meta_reader.clean_up(True)
+ del self.input_model.meta_reader
+ torch.cuda.empty_cache()
+
+ def export_misc(self, bin: BaseReader) -> None:
+ """Export embedding, norm, output weight."""
+ emb = bin.tok_embeddings()
+ norm_weight = bin.norm_weight()
+ output_weight = bin.output_weight()
+
+ def pad_weight(tensor):
+ pad_size = None
+ vocab_size = self.cfg.vocab_size
+ tp = self.cfg.tensor_para_size
+ if vocab_size % tp != 0:
+ pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size
+
+ if pad_size is None:
+ return tensor
+ return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size),
+ 'constant', 0)
+
+ if emb is not None:
+ emb = pad_weight(emb)
+ self.export_weight(emb, 'tok_embeddings.weight')
+ if norm_weight is not None:
+ self.export_weight(norm_weight, 'norm.weight')
+ if output_weight is not None:
+ output_weight = pad_weight(output_weight)
+ self.export_weight(output_weight, 'output.weight')
+
+ @abstractmethod
+ def export_transformer_block(self, bin: BaseReader, i: int) -> None:
+ """Export transformer block."""
+ pass
+
+
+def permute(x: torch.Tensor, size_per_head: int = 128):
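+ """Rearrange the rotary-embedding halves of q/k tensors (weights, biases or per-head scales/zeros)."""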
+ if x.shape[-1] > 1:
+ dim = x.shape[-1]
+ n_heads = dim // size_per_head
+ return x.view(-1, n_heads, 2,
+ dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
+ else: # scales, zeros
+ dim = x.shape[0]
+ n_heads = dim // size_per_head
+ return x.view(n_heads, 2, dim // n_heads // 2,
+ 1).transpose(1, 2).reshape(dim, 1)
+
+
+def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
+ dim: int):
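+ """Fuse q, k, v into one tensor whose output dim is grouped by tensor-parallel rank."""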
+
+ def reshape(x):
+ return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)
+
+ qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
+ # (input_dim, head_num + 2 * kv_head_num)
+ return qkv.view(q.size(0), -1)
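
A shape sketch for the helpers above (illustrative; the head and tp sizes are assumptions, and importing the package assumes a full lmdeploy installation): `permute` reorders rotary halves without changing a tensor's shape, `merge_qkv` packs q/k/v into one tensor grouped by tensor-parallel rank, and `TurbomindModelConfig.from_dict(..., allow_none=True)` builds a partially-filled config whose `valid` flag stays `False` until every field is set.

```python
import torch

from lmdeploy.turbomind.deploy.target_model.base import (TurbomindModelConfig,
                                                          merge_qkv, permute)

q = torch.randn(4096, 4096)   # (input_dim, head_num * size_per_head)
k = torch.randn(4096, 1024)   # (input_dim, kv_head_num * size_per_head)
v = torch.randn(4096, 1024)
qkv = merge_qkv(permute(q), permute(k), v, tp=2, dim=2)
print(qkv.shape)              # torch.Size([4096, 6144])

cfg = TurbomindModelConfig.from_dict(
    dict(model_name='llama2', tensor_para_size=2), allow_none=True)
print(cfg.valid)              # False: head_num, vocab_size, ... are still None
```
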
diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py
new file mode 100644
index 0000000000..d9a7783436
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/fp.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+
+from ..source_model.base import BaseInputModel, BaseReader
+from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
+ merge_qkv, permute)
+
+
+def transpose_tensor(input: List[torch.Tensor]):
+ """Transpose tensor."""
+ output = [x.cuda().t() for x in input]
+ return output
+
+
+@OUTPUT_MODELS.register_module(name='fp16')
+class TurbomindModel(BaseOutputModel):
+ """Export to turbomind fp16 format."""
+
+ def __init__(self,
+ input_model: BaseInputModel,
+ cfg: TurbomindModelConfig,
+ to_file: bool = True,
+ out_dir: str = ''):
+ super().__init__(input_model, cfg, to_file, out_dir)
+
+ def get_config(self, cfg: TurbomindModelConfig):
+ """Get turbomind config."""
+ final_cfg = super().get_config(cfg).__dict__
+
+ # attn_bias, inter_size
+ visit = False
+ attn_bias = 0
+ for bin in self.input_model.bins():
+ for i in range(bin.start_layer_id, bin.end_layer_id):
+ visit = True
+ w1, _, _ = bin.ffn(i)
+ inter_size = w1.t().shape[-1]
+ qb, _, _, _ = bin.attn_bias(i)
+ if qb is not None:
+ attn_bias = 1
+ break
+ if visit:
+ break
+ final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
+ return TurbomindModelConfig.from_dict(final_cfg)
+
+ def export_transformer_block(self, bin: BaseReader, i: int):
+ """Export transformer layer i."""
+ assert bin.start_layer_id <= i < bin.end_layer_id
+ tp = self.cfg.tensor_para_size
+ size_per_head = self.cfg.size_per_head
+ # attn
+ qw, kw, vw, ow = bin.attn(i)
+ qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow])
+ qw = permute(qw, size_per_head)
+ kw = permute(kw, size_per_head)
+ qkv_w = merge_qkv(qw, kw, vw, tp, dim=2)
+ self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1)
+ self.save_split(ow, f'layers.{i}.attention.wo.weight', 0)
+ qb, kb, vb, ob = bin.attn_bias(i)
+ if qb is not None:
+ qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob])
+ qb = permute(qb, size_per_head)
+ kb = permute(kb, size_per_head)
+ qkv_b = merge_qkv(qb, kb, vb, tp, dim=1)
+ self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
+ self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True)
+ # ffn
+ w1, w2, w3 = bin.ffn(i)
+ w1, w2, w3 = transpose_tensor([w1, w2, w3])
+ self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1)
+ self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1)
+ self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0)
+ # norm
+ attn_norm = bin.attn_norm(i)
+ ffn_norm = bin.ffn_norm(i)
+ self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
+ self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
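
Wiring the two registries together gives the fp16 conversion path; roughly what the new `lmdeploy convert` flow is expected to do (a sketch, not the actual CLI code: the paths, output directory layout and model name below are placeholders/assumptions):

```python
from lmdeploy.turbomind.deploy.source_model import llama  # noqa: F401, registers 'hf'
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
from lmdeploy.turbomind.deploy.target_model.base import (OUTPUT_MODELS,
                                                         TurbomindModelConfig)

input_model = INPUT_MODELS.get('hf')(model_path='/path/to/internlm-chat-7b',
                                     tokenizer_path='/path/to/internlm-chat-7b')
cfg = TurbomindModelConfig.from_dict(
    dict(model_name='internlm-chat-7b', tensor_para_size=1), allow_none=True)
output_model = OUTPUT_MODELS.get('fp16')(
    input_model=input_model, cfg=cfg,
    out_dir='./workspace/triton_models/weights')  # directory must already exist
output_model.export()  # writes config.ini plus per-layer weight files
```
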
diff --git a/lmdeploy/turbomind/deploy/target_model/w4.py b/lmdeploy/turbomind/deploy/target_model/w4.py
new file mode 100644
index 0000000000..282c7df607
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/w4.py
@@ -0,0 +1,162 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import sys
+
+import torch
+
+import lmdeploy
+
+from ..source_model.base import BaseInputModel, BaseReader
+from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
+ merge_qkv, permute)
+
+# import _turbomind as _tm
+# TODO: find another way to import _turbomind
+lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
+sys.path.append(osp.join(lmdeploy_dir, 'lib'))
+import _turbomind as _tm # noqa: E402
+
+
+def transpose_qk_s4(src: torch.Tensor, group_size):
+ assert src.is_contiguous()
+ dst = torch.zeros_like(src)
+ _tm.transpose_qk_s4_k_m8(src, dst,
+ src.size(-1) * 8, src.size(0), group_size)
+ return dst
+
+
+def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, w1_s: torch.Tensor,
+ w3_qw: torch.Tensor, w3_qz: torch.Tensor,
+ w3_s: torch.Tensor):
+
+ def fuse(a: torch.Tensor, b: torch.Tensor):
+ ab = torch.cat((a, b)).contiguous()
+ _ab = torch.zeros_like(ab)
+ _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
+ return _ab.view(a.size(0), -1)
+
+ w13_qw = fuse(w1_qw, w3_qw)
+ w13_qz = fuse(w1_qz, w3_qz)
+
+ w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
+ w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)
+
+ return w13_qw, w13_qz, w13_s
+
+
+def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
+ group_size: int):
+ assert qw.is_contiguous()
+ assert qz.is_contiguous()
+ assert s.is_contiguous()
+ _qw = torch.zeros_like(qw)
+ _sz = torch.zeros_like(s, dtype=torch.int32) # half2
+ _ws = torch.zeros_like(s)
+ _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
+ qw.size(-1) * 8, qw.size(0), group_size)
+ return _qw, _sz
+
+
+def tp_m_s4(x: torch.Tensor, tp: int):
+ return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
+ 1).contiguous()
+
+
+def get_cuda_tensor(tensors):
+ """Get cuda tensor."""
+ result = map(lambda x: x.cuda() if x is not None else x, tensors)
+ return (*result, )
+
+
+@OUTPUT_MODELS.register_module(name='w4')
+class TurbomindW4Model(BaseOutputModel):
+ """Export to turbomind w4a16 format."""
+
+ def __init__(self,
+ input_model: BaseInputModel,
+ cfg: TurbomindModelConfig,
+ to_file: bool = True,
+ out_dir: str = ''):
+ super().__init__(input_model, cfg, to_file, out_dir)
+
+ def get_config(self, cfg: TurbomindModelConfig):
+ """Get turbomind config."""
+ final_cfg = super().get_config(cfg).__dict__
+
+ # attn_bias, inter_size
+ visit = False
+ attn_bias = 0
+ for bin in self.input_model.bins():
+ for i in range(bin.start_layer_id, bin.end_layer_id):
+ visit = True
+ w1s, _, _ = bin.ffn_scale(i)
+ inter_size = w1s.shape[-1]
+ qb, _, _, _ = bin.attn_bias(i)
+ if qb is not None:
+ attn_bias = 1
+ break
+ if visit:
+ break
+ final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
+ return TurbomindModelConfig.from_dict(final_cfg)
+
+ def export_transformer_block(self, bin: BaseReader, i: int):
+ """Export transformer layer i."""
+ group_size = self.cfg.group_size
+ tp = self.cfg.tensor_para_size
+ size_per_head = self.cfg.size_per_head
+ # attn
+ q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
+ q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i))
+ q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i))
+
+ q_qw = transpose_qk_s4(q_qw, group_size)
+ k_qw = transpose_qk_s4(k_qw, group_size)
+ q_qz = transpose_qk_s4(q_qz, group_size)
+ k_qz = transpose_qk_s4(k_qz, group_size)
+ q_s = permute(q_s, size_per_head)
+ k_s = permute(k_s, size_per_head)
+
+ qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
+ qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
+ qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
+
+ qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
+ qkv_qw = tp_m_s4(qkv_qw, tp)
+ self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
+ self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
+
+ o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
+ self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
+ self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)
+
+ q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
+ if q_b is not None:
+ q_b = permute(q_b, size_per_head)
+ k_b = permute(k_b, size_per_head)
+ qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
+ self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
+ self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)
+
+ # ffn weights
+ w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
+ w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
+ w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i))
+
+ w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
+ w3_s)
+ w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
+ w13_qw = tp_m_s4(w13_qw, tp)
+ self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
+ self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
+ -1)
+
+ w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
+ self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
+ self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
+
+ # norm
+ attn_norm = bin.attn_norm(i)
+ ffn_norm = bin.ffn_norm(i)
+ self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
+ self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py
index 328f182158..9a4f0e8c4d 100644
--- a/lmdeploy/turbomind/generate_gemm_config.py
+++ b/lmdeploy/turbomind/generate_gemm_config.py
@@ -2,8 +2,6 @@
import subprocess
-import fire
-
def get_llama_gemm():
import os.path as osp
@@ -30,4 +28,6 @@ def main(head_num: int = 32,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index dcfc499e89..9d2186fea9 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -13,7 +13,7 @@
from torch.nn.utils.rnn import pad_sequence
import lmdeploy
-from lmdeploy.model import MODELS
+from lmdeploy.model import MODELS, BaseModel
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.utils import get_logger
@@ -78,7 +78,11 @@ class TurboMind:
tp (int): tensor parallel
"""
- def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
+ def __init__(self,
+ model_path: str,
+ eos_id: int = 2,
+ tp: int = 1,
+ **kwargs):
self.eos_id = eos_id
# TODO: support mpi
@@ -88,7 +92,6 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
# read meta from model path
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
self.gpu_count = tp
- self.session_len = 2048
data_type = 'fp16'
ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
with open(ini_path, 'r') as f:
@@ -102,18 +105,18 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
if len(section_name) > 0:
tp_cfg = parser.getint(section_name, 'tensor_para_size')
- self.session_len = parser.getint(section_name, 'session_len')
if tp_cfg != 1 and tp_cfg != tp:
get_logger('turbomind').info(
f'found tp={tp_cfg} in config.ini.')
self.gpu_count = tp_cfg
self.model_name = parser.get(section_name, 'model_name')
data_type = parser.get(section_name, 'weight_type')
- model = MODELS.get(self.model_name)()
+ self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+ self.session_len = self.model.session_len
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
- self.stop_words = _stop_words(model.stop_words, tokenizer)
+ self.stop_words = _stop_words(self.model.stop_words, tokenizer)
# params
self.node_id = node_id
@@ -122,17 +125,17 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
# create model
weight_dir = osp.join(model_path, 'triton_models', 'weights')
- model = _tm.AbstractTransformerModel.create_llama_model(
+ model_comm = _tm.AbstractTransformerModel.create_llama_model(
weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
- self.model = model
- self.nccl_params = model.create_nccl_params(self.node_id)
+ self.model_comm = model_comm
+ self.nccl_params = model_comm.create_nccl_params(self.node_id)
torch.cuda.synchronize()
# create weight
def _create_weight(device_id):
with cuda_ctx(device_id):
rank = self.node_id * self.gpu_count + device_id
- model.create_shared_weights(device_id, rank)
+ model_comm.create_shared_weights(device_id, rank)
threads = []
for device_id in range(self.gpu_count):
@@ -161,7 +164,7 @@ class TurboMindInstance:
cuda_stream_id(int): identity of a cuda stream
"""
- def __init__(self, tm_model, cuda_stream_id=0):
+ def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0):
self.tm_model = tm_model
self.cuda_stream_id = cuda_stream_id
@@ -175,7 +178,7 @@ def __init__(self, tm_model, cuda_stream_id=0):
self.session_len = tm_model.session_len
self.nccl_params = tm_model.nccl_params
- self.instance_comm = tm_model.model.create_instance_comm(
+ self.instance_comm = tm_model.model_comm.create_instance_comm(
self.gpu_count)
# create model instances
@@ -196,7 +199,7 @@ def __init__(self, tm_model, cuda_stream_id=0):
def _create_model_instance(self, device_id, model_insts):
with cuda_ctx(device_id):
rank = self.node_id * self.gpu_count + device_id
- model_inst = self.tm_model.model.create_model_instance(
+ model_inst = self.tm_model.model_comm.create_model_instance(
device_id, rank, self.cuda_stream_id, self.nccl_params)
model_insts[device_id] = model_inst
@@ -266,7 +269,7 @@ def stream_infer(self,
self.model_insts[0].register_callback(self._forward_callback)
if len(input_ids) == 0:
- input_ids = []
+ input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
@@ -381,7 +384,7 @@ def decode(self, input_ids):
"""
if len(input_ids) == 0:
- input_ids = []
+ input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 417dc76768..0bd4914cc4 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
-__version__ = '0.0.11'
+__version__ = '0.0.14'
short_version = __version__
diff --git a/requirements.txt b/requirements.txt
index 9eacb498fb..27049672c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ accelerate
datasets
fastapi
fire
-gradio
+gradio<4.0.0
mmengine
numpy
pybind11
diff --git a/setup.py b/setup.py
index 09ae1e31c2..df36118c23 100644
--- a/setup.py
+++ b/setup.py
@@ -121,26 +121,29 @@ def gen_packages_items():
if __name__ == '__main__':
lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
- setup(name='lmdeploy',
- version=get_version(),
- description='A toolset for compressing, deploying and serving LLM',
- long_description=readme(),
- long_description_content_type='text/markdown',
- author='OpenMMLab',
- author_email='openmmlab@gmail.com',
- packages=find_packages(exclude=()),
- package_data={
- 'lmdeploy': lmdeploy_package_data,
- },
- include_package_data=True,
- install_requires=parse_requirements('requirements.txt'),
- has_ext_modules=check_ext_modules,
- classifiers=[
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10',
- 'Programming Language :: Python :: 3.11',
- 'Intended Audience :: Developers',
- 'Intended Audience :: Education',
- 'Intended Audience :: Science/Research',
- ])
+ setup(
+ name='lmdeploy',
+ version=get_version(),
+ description='A toolset for compressing, deploying and serving LLM',
+ long_description=readme(),
+ long_description_content_type='text/markdown',
+ author='OpenMMLab',
+ author_email='openmmlab@gmail.com',
+ packages=find_packages(exclude=()),
+ package_data={
+ 'lmdeploy': lmdeploy_package_data,
+ },
+ include_package_data=True,
+ install_requires=parse_requirements('requirements.txt'),
+ has_ext_modules=check_ext_modules,
+ classifiers=[
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Science/Research',
+ ],
+ entry_points={'console_scripts': ['lmdeploy = lmdeploy.cli:run']},
+ )
diff --git a/tests/test_lmdeploy/test_cli.py b/tests/test_lmdeploy/test_cli.py
new file mode 100644
index 0000000000..a41eab442e
--- /dev/null
+++ b/tests/test_lmdeploy/test_cli.py
@@ -0,0 +1,51 @@
+import inspect
+
+
+def compare_func(class_method, function):
+ """Compare if a class method has same arguments as a function."""
+
+ argspec_cls = inspect.getfullargspec(class_method)
+ argspec_func = inspect.getfullargspec(function)
+ assert argspec_cls.args[1:] == argspec_func.args
+ assert argspec_cls.defaults == argspec_func.defaults
+ assert argspec_cls.annotations == argspec_func.annotations
+
+
+def test_cli():
+
+ from lmdeploy.cli.cli import CLI
+ from lmdeploy.serve.turbomind.deploy import main as convert
+ compare_func(CLI.convert, convert)
+
+
+def test_subcli_chat():
+ from lmdeploy.cli.chat import SubCliChat
+ from lmdeploy.pytorch.chat import main as run_torch_model
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ compare_func(SubCliChat.torch, run_torch_model)
+ compare_func(SubCliChat.turbomind, run_turbomind_model)
+
+
+def test_subcli_lite():
+ from lmdeploy.cli.lite import SubCliLite
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+ from lmdeploy.lite.apis.calibrate import calibrate
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ compare_func(SubCliLite.auto_awq, auto_awq)
+ compare_func(SubCliLite.calibrate, calibrate)
+ compare_func(SubCliLite.kv_qparams, run_kv_qparams)
+
+
+def test_subcli_serve():
+ from lmdeploy.cli.serve import SubCliServe
+ from lmdeploy.serve.client import main as run_triton_client
+ from lmdeploy.serve.gradio.app import run as run_gradio
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ compare_func(SubCliServe.gradio, run_gradio)
+ compare_func(SubCliServe.api_server, run_api_server)
+ compare_func(SubCliServe.api_client, run_api_client)
+ compare_func(SubCliServe.triton_client, run_triton_client)
diff --git a/tests/test_lmdeploy/test_tokenizer.py b/tests/test_lmdeploy/test_tokenizer.py
new file mode 100644
index 0000000000..ff7d8047b2
--- /dev/null
+++ b/tests/test_lmdeploy/test_tokenizer.py
@@ -0,0 +1,24 @@
+import pytest
+
+from lmdeploy.tokenizer import HuggingFaceTokenizer
+
+
+@pytest.mark.parametrize('model_path', [
+ 'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
+ 'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf',
+ 'upstage/SOLAR-0-70b-16bit'
+])
+@pytest.mark.parametrize(
+ 'input', ['hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5])
+def test_tokenizer(model_path, input):
+ tokenizer = HuggingFaceTokenizer(model_path)
+ encoded = tokenizer.encode(input)
+ output = ''
+ offset = 0
+ for i in range(1, len(encoded) + 1):
+ decoded = tokenizer.decode(encoded[:i], offset)
+ if decoded.endswith('�'):
+ continue
+ output += decoded
+ offset = i
+ assert input == output, 'input string should equal the output after encode-decode'