diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml
index 86838836de..d9e6956735 100644
--- a/.github/ISSUE_TEMPLATE/1-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -25,6 +25,18 @@ body:
A placeholder for the command.
validations:
required: true
+- type: textarea
+ attributes:
+ label: Environment
+ description: |
+ 1. Please run `lmdeploy check_env` to collect necessary environment information and paste it here.
+ 2. You may add any additional information that may be helpful for locating the problem, such as
+ - How you installed PyTorch \[e.g., pip, conda, source\]
+ - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+ placeholder: Environment here.
+ render: Shell
+ validations:
+ required: true
- type: textarea
attributes:
label: Error traceback
diff --git a/README.md b/README.md
index a2de4d6ac0..7639675aba 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
## Supported Models
-`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`.
+`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.
### TurboMind
@@ -63,6 +63,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
+| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
@@ -118,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### Inference by TurboMind
```shell
-python -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -139,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace
#### Serving with gradio
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -149,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
Launch inference server by:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --instance_num 32 --tp 1
```
Then, you can communicate with it by command line,
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client api_server_url
```
or webui,
```shell
-# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
+# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
Refer to [restful_api.md](docs/en/restful_api.md) for more details.
@@ -181,13 +182,13 @@ bash workspace/service_docker_up.sh
Then, you can communicate with the inference server by command line,
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_addresss}:33337
```
or webui,
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_addresss}:33337
```
For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md)
@@ -199,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc
#### Single GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 09c66c2826..38faad0583 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -53,7 +53,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
## 支持的模型
-`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端
+`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端。运行`lmdeploy list`可查看支持模型列表
### TurboMind
@@ -64,6 +64,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht
| :----------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
+| SOLAR | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
@@ -119,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1
# 2. 转换为 trubomind 要求的格式。默认存放路径为 ./workspace
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
#### 使用 turbomind 推理
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
> **Note**
@@ -139,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
#### 启动 gradio server
```shell
-python3 -m lmdeploy.serve.gradio.app ./workspace
+lmdeploy serve gradio ./workspace
```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
@@ -149,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
使用下面的命令启动推理服务:
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
你可以通过命令行方式与推理服务进行对话:
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client api_server_url
```
也可以通过 WebUI 方式来对话:
```shell
-# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
+# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)。
@@ -181,13 +182,13 @@ bash workspace/service_docker_up.sh
你可以通过命令行方式与推理服务进行对话:
```shell
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
+lmdeploy serve triton_client {server_ip_addresss}:33337
```
也可以通过 WebUI 方式来对话:
```shell
-python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337
+lmdeploy serve gradio {server_ip_addresss}:33337
```
其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md)
@@ -203,7 +204,7 @@ pip install deepspeed
#### 单个 GPU
```shell
-python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\
+lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\
--max_new_tokens 64 \
--temperture 0.8 \
--top_p 0.95 \
diff --git a/benchmark/README.md b/benchmark/README.md
index b5573ae2b8..3fa117210e 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -30,7 +30,7 @@ pip install nvidia-ml-py
```bash
python profile_generation.py \
--model-path /path/to/your/model \
- --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
+ --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
```
## profile serving
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
index e64a6708cd..325877f4e3 100644
--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
@@ -106,7 +106,7 @@ def _infer(model, session_id):
def profile_throughput(model_path: str,
concurrency: int = 1,
- input_seqlen: int = 0,
+ input_seqlen: int = 1,
output_seqlen: int = 512,
test_round: int = 10,
tp: int = 1,
@@ -133,8 +133,10 @@ def profile_throughput(model_path: str,
)
# make up a prompt that can be tokenized into {input_seqlen} tokens
- prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
+ assert input_seqlen > 0, 'input_seqlen should be greater than 0'
+ prompt = 'hi'
input_ids = tokenizer.encode(prompt)
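+ # repeating the encoded ids approximates a prompt of about input_seqlen tokens;
+ # the actual length is len(tokenizer.encode('hi')) * input_seqlen and may
+ # include repeated special tokens (e.g. BOS) depending on the tokenizer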
+ input_ids = input_ids * input_seqlen
warmup(tm_model,
concurrency,
diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index d1f6ebf80e..394c7ec1b9 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -1,98 +1,73 @@
import json
-import multiprocessing as mp
import random
import time
-from typing import Iterable, List
+from queue import Queue
+from threading import Thread
import fire
import numpy as np
-import requests
+from lmdeploy.serve.openai.api_client import get_streaming_response
from lmdeploy.tokenizer import Tokenizer
-from lmdeploy.utils import get_logger
-
-
-def get_streaming_response(prompt: str,
- api_url: str,
- session_id: int,
- request_output_len: int,
- stream: bool = True,
- sequence_start: bool = True,
- sequence_end: bool = False,
- ignore_eos: bool = False) -> Iterable[List[str]]:
- headers = {'User-Agent': 'Test Client'}
- pload = {
- 'prompt': prompt,
- 'stream': stream,
- 'session_id': session_id,
- 'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
- 'ignore_eos': ignore_eos
- }
- response = requests.post(api_url,
- headers=headers,
- json=pload,
- stream=stream)
- for chunk in response.iter_lines(chunk_size=8192,
- decode_unicode=False,
- delimiter=b'\n'):
- if chunk:
- data = json.loads(chunk.decode('utf-8'))
- output = data['text']
- tokens = data['tokens']
- yield output, tokens
-
-
-def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
- res_que: mp.Queue):
+
+
+def infer(server_addr: str, session_id: int, req_queue: Queue, res_que: Queue,
+ stream_output: bool):
stats = []
- while not req_queue.empty():
- prompt, input_seqlen, output_seqlen = req_queue.get()
- get_logger('profile_restful_api').info(
- f'request info: session {session_id}, '
- f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}')
+ for prompt, input_seqlen, output_seqlen in iter(req_queue.get,
+ [None, None, None]):
+ if prompt is None:
+ break
timestamps = []
tokens = []
- start = time.perf_counter()
- for res, token in get_streaming_response(
+ timestamps.append(time.perf_counter())
+ for res, token, status in get_streaming_response(
prompt,
server_addr,
session_id,
request_output_len=output_seqlen,
- sequence_start=True,
- sequence_end=True):
+ interactive_mode=False,
+ ignore_eos=True,
+ stream=stream_output):
timestamps.append(time.perf_counter())
tokens.append(token)
- first_token_latency = timestamps[1] - start
- token_latency = timestamps[-1] - timestamps[0]
- token = tokens[-1] - tokens[0]
- stats.append([first_token_latency, token, token_latency])
+ first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
+ token_latency = np.round(timestamps[-1] - timestamps[0], 3)
+ completion_tokens = tokens[-1]
+ total_tokens = tokens[-1] + input_seqlen
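+ # stats row layout: [first_token_latency, completion_tokens,
+ # requested output_seqlen, total_tokens, token_latency]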
+ stats.append([
+ first_token_latency, completion_tokens, output_seqlen,
+ total_tokens, token_latency
+ ])
+ print(f'session {session_id}: '
+ f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
+ f'completion_tokens {completion_tokens}')
res_que.put((session_id, stats))
def warmup(server_addr: str,
concurrency: int,
output_seqlen: int,
- warmup_round: int = 1):
+ warmup_round: int = 1,
+ stream_output: bool = False):
print('start to warmup ...')
def _infer(server_addr, session_id):
for _ in range(warmup_round):
- for _, _ in get_streaming_response(
- '',
- server_addr,
- session_id,
- request_output_len=output_seqlen,
- sequence_start=True,
- sequence_end=True):
+ for _ in get_streaming_response('',
+ server_addr,
+ session_id,
+ request_output_len=output_seqlen,
+ interactive_mode=False,
+ stream=stream_output,
+ ignore_eos=True):
continue
_start = time.perf_counter()
procs = []
for i in range(concurrency):
- proc = mp.Process(target=_infer, args=(server_addr, i + 1))
+ proc = Thread(target=_infer, args=(server_addr, i + 1))
procs.append(proc)
proc.start()
for proc in procs:
@@ -115,6 +90,7 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
print(f'elapsed time for read data: '
f'{round(time.perf_counter() - start, 2)} s')
+ print('start tokenization. This takes a while, please wait...')
start = time.perf_counter()
tokenizer = Tokenizer(tokenizer_path)
prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
@@ -136,9 +112,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
if samples > 0:
filtered_dataset = random.sample(filtered_dataset, samples)
- que = mp.Queue()
+ que = Queue()
for data in filtered_dataset:
que.put(data)
+ que.put((None, None, None))
print(f'elapsed time for filtering: '
f'{round(time.perf_counter() - start, 2)} s')
return que, len(filtered_dataset)
@@ -149,17 +126,20 @@ def main(server_addr: str,
dataset_path: str,
concurrency: int = 1,
session_len: int = 2048,
- samples: int = 1000):
- api_url = server_addr + '/generate'
- warmup(api_url, concurrency, session_len - 1)
+ samples: int = 1000,
+ stream_output: bool = False):
+ api_url = server_addr + '/v1/chat/interactive'
+ warmup(api_url, concurrency, session_len - 1, 4, stream_output)
req_queue, n_req = read_dataset(tokenizer_path, dataset_path, samples,
session_len)
- res_que = mp.Queue()
+ for i in range(concurrency):
+ req_queue.put([None, None, None])
+ res_que = Queue()
procs = []
_start = time.perf_counter()
for i in range(concurrency):
- proc = mp.Process(target=infer,
- args=(api_url, i + 1, req_queue, res_que))
+ proc = Thread(target=infer,
+ args=(api_url, i + 1, req_queue, res_que, stream_output))
procs.append(proc)
proc.start()
for proc in procs:
@@ -174,22 +154,40 @@ def main(server_addr: str,
f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
stats.append(np.array(_stats))
- stats = np.concatenate(stats).reshape(-1, 3)
+ stats = np.concatenate(stats).reshape(-1, 5)
first_token_latency_min = np.min(stats[:, 0], axis=0)
first_token_latency_max = np.max(stats[:, 0], axis=0)
first_token_latency_ave = np.mean(stats[:, 0], axis=0)
- token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
- req_throughput = n_req / elapsed_time
+ completion_tokens = np.sum(stats[:, 1], axis=0)
+ request_output_tokens = np.sum(stats[:, 2], axis=0)
+ total_tokens = np.sum(stats[:, 3], axis=0)
+ prompt_tokens = total_tokens - completion_tokens
+ completion_token_throughput = completion_tokens / elapsed_time
+ total_token_throughput = total_tokens / elapsed_time
+ rqs = n_req / elapsed_time
+ rqm = rqs * 60
+
+ if not (np.abs(stats[:, 1] - stats[:, 2]) <= 1).all():
+ print(f'Did not generate requested number of tokens. '
+ f'Requested {request_output_tokens:.0f}, '
+ f'but got {completion_tokens:.0f}')
print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
- f'elapsed_time: {elapsed_time:.2f}s\n'
- f'first_token latency(min, max, ave): '
- f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
- f'{first_token_latency_ave:.2f}s\n'
- f'token throughput: {token_throughput:.2f} token/s\n'
- f'req throughput: {req_throughput:.2f} req/s\n'
- f'{"-" * 50}\n')
+ f'elapsed_time: {elapsed_time:.3f}s\n')
+ if stream_output:
+ print(f'first_token latency(min, max, ave): '
+ f'{first_token_latency_min:.3f}s, '
+ f'{first_token_latency_max:.3f}s, '
+ f'{first_token_latency_ave:.3f}s\n')
+ print(
+ f'number of prompt tokens: {prompt_tokens:.0f}\n'
+ f'number of completion tokens: {completion_tokens:.0f}\n'
+ f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
+ f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
+ f'RPS (request per second): {rqs:.3f} req/s\n'
+ f'RPM (request per minute): {rqm:.3f} req/min\n'
+ f'{"-" * 50}\n')
if __name__ == '__main__':
diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py
index 4580757eeb..ee23452d8a 100644
--- a/benchmark/profile_serving.py
+++ b/benchmark/profile_serving.py
@@ -17,7 +17,7 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
[None, None, None]):
timestamps = []
tokens = []
- start = time.perf_counter()
+ timestamps.append(time.perf_counter())
for status, res, token in chatbot.stream_infer(
session_id,
prompt,
@@ -26,13 +26,17 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
sequence_end=True):
timestamps.append(time.perf_counter())
tokens.append(token)
-
- first_token_latency = np.round(timestamps[1] - start, 3)
+ first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
token_latency = np.round(timestamps[-1] - timestamps[0], 3)
- token = tokens[-1] - tokens[0]
- stats.append([first_token_latency, token, token_latency])
+ completion_tokens = tokens[-1]
+ total_tokens = tokens[-1] + input_seqlen
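+ # stats row: [first_token_latency, completion_tokens, requested output_seqlen, total_tokens, token_latency]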
+ stats.append([
+ first_token_latency, completion_tokens, output_seqlen,
+ total_tokens, token_latency
+ ])
print(f'session {session_id}: '
- f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}')
+ f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
+ f'completion_tokens {completion_tokens}')
res_que.put((session_id, stats))
@@ -84,6 +88,7 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
completions = [completion for _, completion in dataset]
print(f'elapsed time for read data: '
f'{round(time.perf_counter() - start, 2)} s')
+ print('start tokenization. This takes a while, please wait...')
start = time.perf_counter()
tokenizer = Tokenizer(tokenizer_path)
@@ -124,7 +129,6 @@ def main(tritonserver_addr: str,
res_que = mp.Queue()
procs = []
- _start = time.perf_counter()
for i in range(concurrency):
chatbot = Chatbot(tritonserver_addr=tritonserver_addr,
display=False,
@@ -134,13 +138,15 @@ def main(tritonserver_addr: str,
proc = mp.Process(target=infer,
args=(chatbot, i + 1, req_que, res_que))
procs.append(proc)
- proc.start()
# read data and put it to queue
n_req = read_dataset(tokenizer_path, dataset_path, samples, session_len,
req_que)
for i in range(concurrency):
req_que.put([None, None, None])
+ _start = time.perf_counter()
+ for proc in procs:
+ proc.start()
stats = []
for i in range(concurrency):
@@ -149,27 +155,42 @@ def main(tritonserver_addr: str,
f'session {session_id}: processed reqs {len(_stats)}, '
f'stats: \n{_stats}\n{"-" * 50}\n')
stats.append(np.array(_stats))
-
_end = time.perf_counter()
+
elapsed_time = _end - _start
- stats = np.concatenate(stats).reshape(-1, 3)
+ stats = np.concatenate(stats).reshape(-1, 5)
first_token_latency_min = np.min(stats[:, 0], axis=0)
first_token_latency_max = np.max(stats[:, 0], axis=0)
first_token_latency_ave = np.mean(stats[:, 0], axis=0)
- token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time
- req_throughput = n_req / elapsed_time
-
- print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
- f'elapsed_time: {elapsed_time:.3f}s\n'
- f'first_token latency(min, max, ave): '
- f'{first_token_latency_min:.3f}s, {first_token_latency_max:.3f}s, '
- f'{first_token_latency_ave:.3f}s\n'
- f'token throughput: {token_throughput:.3f} token/s\n'
- f'req throughput: {req_throughput:.3f} req/s\n'
- f'{"-" * 50}\n')
-
+ completion_tokens = np.sum(stats[:, 1], axis=0)
+ request_output_tokens = np.sum(stats[:, 2], axis=0)
+ total_tokens = np.sum(stats[:, 3], axis=0)
+ prompt_tokens = total_tokens - completion_tokens
+ completion_token_throughput = completion_tokens / elapsed_time
+ total_token_throughput = total_tokens / elapsed_time
+ rqs = n_req / elapsed_time
+ rqm = rqs * 60
+
+ if not (np.abs(stats[:, 1] - stats[:, 2]) <= 1).all():
+ print(f'Did not generate requested number of tokens. '
+ f'Requested {request_output_tokens:.0f}, '
+ f'but got {completion_tokens:.0f}')
+
+ print(
+ f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
+ f'elapsed_time: {elapsed_time:.3f}s\n'
+ f'first_token latency(min, max, ave): '
+ f'{first_token_latency_min:.3f}s, {first_token_latency_max:.3f}s, '
+ f'{first_token_latency_ave:.3f}s\n'
+ f'number of prompt tokens: {prompt_tokens:.0f}\n'
+ f'number of completion tokens: {completion_tokens:.0f}\n'
+ f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
+ f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
+ f'RPS (request per second): {rqs:.3f} req/s\n'
+ f'RPM (request per minute): {rqm:.3f} req/min\n'
+ f'{"-" * 50}\n')
for proc in procs:
proc.join()
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 610fbb7657..77a0b6f242 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -8,6 +8,7 @@
from typing import List, Tuple
import fire
+import numpy as np
from lmdeploy.tokenizer import Tokenizer
@@ -80,88 +81,137 @@ def __init__(self, model_path: str, tp: int = 1):
self.tm_model = tm_model
self.tokenizer = tokenizer
- def _inference(self, queue, session_id: int):
-
+ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
+ stream_output: bool):
model_inst = self.tm_model.create_instance()
- while True:
- request = queue.get()
- if request is None:
- # stop signal
- queue.put(None)
- return
- else:
- prompt, _, output_seqlen = request
- input_ids = self.tokenizer.encode(prompt)
-
- for outputs in model_inst.stream_infer(
- session_id,
- input_ids=input_ids,
- request_output_len=output_seqlen,
- temperature=1.0,
- top_p=1.0,
- sequence_start=True,
- sequence_end=True,
- ignore_eos=True,
- sampling_param=self.sampling_param):
- if len(outputs) > 1:
- res, tokens = outputs[-2:]
- else:
- res, tokens = outputs[0]
- self.tokenizer.decode(res)
-
- # for pytorch engine to restart a session
- if hasattr(model_inst, 'end'):
- model_inst.end(session_id)
-
- def process_request(self, requests, concurrency: int = 1):
- q = Queue()
+ stats = []
+ timestamps = []
+ tokens = []
+ timestamps.append(time.perf_counter())
+ for prompt, input_seqlen, output_seqlen in iter(
+ req_queue.get, [None, None, None]):
+ input_ids = self.tokenizer.encode(prompt)
+ offset = 0
+ for outputs in model_inst.stream_infer(
+ session_id,
+ input_ids=input_ids,
+ request_output_len=output_seqlen,
+ temperature=1.0,
+ top_p=1.0,
+ sequence_start=True,
+ sequence_end=True,
+ ignore_eos=True,
+ stream_output=stream_output):
+ if len(outputs) > 1:
+ res, token = outputs[-2:]
+ else:
+ res, token = outputs[0]
+ self.tokenizer.decode(res, offset)
+ offset = token
+ timestamps.append(time.perf_counter())
+ tokens.append(token)
+ # for pytorch engine to restart a session
+ if hasattr(model_inst, 'end'):
+ model_inst.end(session_id)
+ first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
+ token_latency = np.round(timestamps[-1] - timestamps[0], 3)
+ completion_tokens = tokens[-1]
+ total_tokens = tokens[-1] + len(input_ids)
+ stats.append([
+ first_token_latency, completion_tokens, output_seqlen,
+ total_tokens, token_latency
+ ])
+ print(
+ f'session {session_id}: '
+ f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, '
+ f'completion_tokens {completion_tokens}')
+ res_queue.put((session_id, stats))
+
+ def process_request(self,
+ requests,
+ concurrency: int = 1,
+ stream_output: bool = True):
+ res_queue = Queue()
+ req_queue = Queue()
threads = []
+ # feed request to q
+ for req in requests:
+ req_queue.put(req)
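+ # one stop sentinel per worker thread so that each consumer's
+ # iter(req_queue.get, [None, None, None]) loop terminates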
+ for i in range(concurrency):
+ req_queue.put([None, None, None])
+
start = time.time()
# start threads
for i in range(concurrency):
- t = Thread(target=self._inference, args=(q, i))
+ t = Thread(target=self._inference,
+ args=(req_queue, res_queue, i, stream_output))
t.start()
threads.append(t)
- # feed request to q
- for req in requests:
- q.put(req)
-
- q.put(None)
-
# wait for finish
for t in threads:
t.join()
- end = time.time()
-
- return end - start
+ elapsed_time = time.time() - start
+
+ stats = []
+ while not res_queue.empty():
+ session_id, _stats = res_queue.get()
+ print(f'\n{"-" * 50}\n'
+ f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
+ stats.append(np.array(_stats))
+
+ stats = np.concatenate(stats).reshape(-1, 5)
+
+ first_token_latency_min = np.min(stats[:, 0], axis=0)
+ first_token_latency_max = np.max(stats[:, 0], axis=0)
+ first_token_latency_ave = np.mean(stats[:, 0], axis=0)
+ completion_tokens = np.sum(stats[:, 1], axis=0)
+ request_output_tokens = np.sum(stats[:, 2], axis=0)
+ total_tokens = np.sum(stats[:, 3], axis=0)
+ prompt_tokens = total_tokens - completion_tokens
+ completion_token_throughput = completion_tokens / elapsed_time
+ total_token_throughput = total_tokens / elapsed_time
+ rqs = len(requests) / elapsed_time
+ rqm = rqs * 60
+
+ if not (np.abs(stats[:, 1] - stats[:, 2]) <= 1).all():
+ print(f'Did not generate requested number of tokens. '
+ f'Requested {request_output_tokens:.0f}, '
+ f'but got {completion_tokens:.0f}')
+
+ print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
+ f'elapsed_time: {elapsed_time:.3f}s\n')
+ if stream_output:
+ print(f'first_token latency(min, max, ave): '
+ f'{first_token_latency_min:.3f}s, '
+ f'{first_token_latency_max:.3f}s, '
+ f'{first_token_latency_ave:.3f}s\n')
+ print(
+ f'number of prompt tokens: {prompt_tokens:.0f}\n'
+ f'number of completion tokens: {completion_tokens:.0f}\n'
+ f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
+ f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
+ f'RPS (request per second): {rqs:.3f} req/s\n'
+ f'RPM (request per minute): {rqm:.3f} req/min\n'
+ f'{"-" * 50}\n')
def main(dataset: str,
model_path: str,
concurrency: int = 1,
num_prompts: int = 1000,
- tp: int = 1):
+ tp: int = 1,
+ stream_output: bool = True):
engine = Engine(model_path, tp=tp)
tokenizer = engine.tokenizer
requests = sample_requests(dataset, num_prompts, tokenizer)
- elapsed_time = engine.process_request(requests, concurrency)
- total_num_tokens = sum(prompt_len + output_len
- for _, prompt_len, output_len in requests)
- total_num_out_tokens = sum(output_len for _, _, output_len in requests)
- print(f'Throughput requests: {len(requests) / elapsed_time:.2f} req/s')
- print(
- f'Throughput requests: {len(requests) * 60 / elapsed_time:.2f} req/min'
- )
- print(f'Throughput tokens: {total_num_tokens / elapsed_time:.2f} tokens/s')
- print('Throughput tokens(output only):'
- f'{total_num_out_tokens / elapsed_time:.2f} tokens/s')
+ engine.process_request(requests, concurrency, stream_output)
if __name__ == '__main__':
diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh
index abb90562a2..8d1eb16de9 100755
--- a/builder/manywheel/entrypoint_build.sh
+++ b/builder/manywheel/entrypoint_build.sh
@@ -11,7 +11,7 @@ source /opt/conda/bin/activate
conda activate $PYTHON_VERSION
cd lmdeploy
-mkdir build && cd build
+mkdir -p build && cd build && rm -rf *
bash ../generate.sh
make -j$(nproc) && make install
if [ $? != 0 ]; then
diff --git a/docs/en/build.md b/docs/en/build.md
index 7ee53ac90c..cb278073c9 100644
--- a/docs/en/build.md
+++ b/docs/en/build.md
@@ -1,22 +1,79 @@
-## Build from source
+# Build from source
-- install packages for compiling and running:
+LMDeploy provides prebuilt packages that can be installed easily with `pip install lmdeploy`.
- ```shell
- conda create -n lmdeploy python=3.10
- conda activate lmdeploy
+If you want to build lmdeploy from source, please clone the lmdeploy repository from GitHub and follow the instructions in the next sections:
- git clone https://github.com/InternLM/lmdeploy.git
- cd lmdeploy
+```shell
+git clone --depth=1 https://github.com/InternLM/lmdeploy
+```
- pip install -r requirements.txt
- conda install openmpi-mpicxx nccl rapidjson -c conda-forge
- ```
+## Build in Docker (recommended)
+
+We strongly recommend using the provided docker image to build lmdeploy, as it avoids complex environment setup.
+
+The docker image is `openmmlab/lmdeploy-builder:cuda11.8`. Make sure that docker is installed before using this image.
+
+In the root directory of the lmdeploy source code, please run the following command:
+
+```shell
+cd lmdeploy # the home folder of lmdeploy source code
+bash builder/manywheel/build_all_wheel.sh
+```
+
+The wheel files of lmdeploy for Python 3.8 - 3.11 will be placed in the `builder/manywheel/cuda11.8_dist` directory, for example:
+
+```text
+builder/manywheel/cuda11.8_dist/
+├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl
+├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl
+├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl
+└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl
+```
+
+If the wheel file for a specific Python version is required, such as py3.8, please execute:
+
+```shell
+bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist
+```
+
+And the wheel file will be found in the `builder/manywheel/cuda11.8_dist` directory.
+
+You can use `pip install` to install the wheel file that matches the Python version on your host machine.
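+
+For example, on a Python 3.10 host the command might look like the following (the exact file name depends on the lmdeploy version you built):
+
+```shell
+pip install builder/manywheel/cuda11.8_dist/lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl
+```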
-- build and install lmdeploy:
+## Build in localhost (optional)
+First, please make sure that the gcc version is no less than 9, which can be confirmed by `gcc --version`.
+
+Then, follow the steps below to set up the compilation environment:
+
+- install the dependent packages:
+ ```shell
+ pip install -r requirements.txt
+ apt-get install rapidjson-dev
+ ```
+- install [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), and set environment variables:
+ ```shell
+ export NCCL_ROOT_DIR=/path/to/nccl/build
+ export NCCL_LIBRARIES=/path/to/nccl/build/lib
+ ```
+- install openmpi from source:
+ ```shell
+ wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz
+ tar xf openmpi-4.1.5.tar.gz
+ cd openmpi-4.1.5
+ ./configure
+ make -j$(nproc) && make install
+ ```
+- build and install lmdeploy libraries:
```shell
+ cd lmdeploy # the home folder of lmdeploy
mkdir build && cd build
sh ../generate.sh
make -j$(nproc) && make install
```
+- install lmdeploy python package:
+ ```shell
+ cd ..
+ pip install -e .
+ ```
diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md
index 1f5f5aa125..5dcf43ba68 100644
--- a/docs/en/kv_int8.md
+++ b/docs/en/kv_int8.md
@@ -18,7 +18,7 @@ dequant: f = q * scale + zp
Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory.
```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b
+lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
```
If you already have a workspace directory, skip this step.
@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps:
```bash
# get minmax
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval
--calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately
@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
--work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format
# get quant parameters
-python3 -m lmdeploy.lite.apis.kv_qparams \
+lmdeploy lite kv_qparams \
--work_dir $WORK_DIR \ # Directory of the last output
--turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters
--kv_sym False \ # Symmetric or asymmetric quantization, default is False
@@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre
Test the chat performance.
```bash
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
## GPU Memory Test
diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md
index e3662ab373..e4cd5a9cbe 100644
--- a/docs/en/pytorch.md
+++ b/docs/en/pytorch.md
@@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and
**Example 1**: Chat with default setting
```shell
-python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
+lmdeploy chat torch $PATH_TO_HF_MODEL
```
**Example 2**: Disable sampling and chat history
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0 --max-history 0
```
@@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \
**Example 3**: Accelerate with deepspeed inference
```shell
-python -m lmdeploy.pytorch.chat \
+lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md
index cb70e26375..7f49edce1e 100644
--- a/docs/en/restful_api.md
+++ b/docs/en/restful_api.md
@@ -3,56 +3,61 @@
### Launch Service
```shell
-python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage.
-We provide four restful api in total. Three of them are in OpenAI format. However, we recommend users try
-our own api which provides more arguments for users to modify. The performance is comparatively better.
+We provide four restful apis in total. Three of them are in OpenAI format:
+
+- /v1/chat/completions
+- /v1/models
+- /v1/completions
+
+However, we recommend that users try our own api `/v1/chat/interactive`,
+which provides more arguments for users to modify. Its performance is comparatively better.
+
+**Note**: if you want to launch multiple requests, please set a different `session_id` for each request to the
+`/v1/chat/completions` and `/v1/chat/interactive` apis. Otherwise, random values will be assigned.
### python
-Here is an example for our own api `generate`.
+We have integrated the client-side functionalities of these services into the `APIClient` class. Below are some examples demonstrating how to invoke the `api_server` service on the client side.
+
+If you want to use the `/v1/chat/completions` endpoint, you can try the following code:
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+model_name = api_client.available_models[0]
+messages = [{"role": "user", "content": "Say this is a test!"}]
+for item in api_client.chat_completions_v1(model=model_name, messages=messages):
+ print(item)
+```
+
+If you want to use the `/v1/completions` endpoint, you can try the following code:
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+model_name = api_client.available_models[0]
+for item in api_client.completions_v1(model=model_name, prompt='hi'):
+ print(item)
+```
+
+LMDeploy supports maintaining session histories on the server for the `/v1/chat/interactive` api. The
+feature is disabled by default.
+
+- In interactive mode, the chat history is kept on the server. In a multi-round conversation, you should set
+  `interactive_mode = True` and use the same `session_id` (it can't be -1, which is the default value) for all requests to `/v1/chat/interactive`.
+- In normal mode, no chat history is kept on the server.
+
+The interactive mode can be controlled by the `interactive_mode` boolean parameter. The following is an example of normal mode. If you want to experience the interactive mode, simply pass in `interactive_mode=True`.
```python
-import json
-import requests
-from typing import Iterable, List
-
-
-def get_streaming_response(prompt: str,
- api_url: str,
- session_id: int,
- request_output_len: int,
- stream: bool = True,
- sequence_start: bool = True,
- sequence_end: bool = True,
- ignore_eos: bool = False) -> Iterable[List[str]]:
- headers = {'User-Agent': 'Test Client'}
- pload = {
- 'prompt': prompt,
- 'stream': stream,
- 'session_id': session_id,
- 'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
- 'ignore_eos': ignore_eos
- }
- response = requests.post(
- api_url, headers=headers, json=pload, stream=stream)
- for chunk in response.iter_lines(
- chunk_size=8192, decode_unicode=False, delimiter=b'\n'):
- if chunk:
- data = json.loads(chunk.decode('utf-8'))
- output = data['text']
- tokens = data['tokens']
- yield output, tokens
-
-
-for output, tokens in get_streaming_response(
- "Hi, how are you?", "http://{server_ip}:{server_port}/generate", 0,
- 512):
- print(output, end='')
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+for item in api_client.generate(prompt='hi'):
+ print(item)
```
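+
+The following is a sketch of the interactive mode described above, assuming `generate` forwards `session_id` and `interactive_mode` to the `/v1/chat/interactive` endpoint (the same fields used in the cURL example below):
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+# assumption: generate() accepts session_id and interactive_mode keyword arguments;
+# reuse one non-default session_id so the server keeps the chat history
+for item in api_client.generate(prompt='hi', session_id=1, interactive_mode=True):
+ print(item)
+for item in api_client.generate(prompt='please continue', session_id=1, interactive_mode=True):
+ print(item)
+```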
### Java/Golang/Rust
@@ -84,16 +89,15 @@ List Models:
curl http://{server_ip}:{server_port}/v1/models
```
-Generate:
+Interactive Chat:
```bash
-curl http://{server_ip}:{server_port}/generate \
+curl http://{server_ip}:{server_port}/v1/chat/interactive \
-H "Content-Type: application/json" \
-d '{
"prompt": "Hello! How are you?",
"session_id": 1,
- "sequence_start": true,
- "sequence_end": true
+ "interactive_mode": true
}'
```
@@ -104,19 +108,19 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "internlm-chat-7b",
- "messages": [{"role": "user", "content": "Hello! Ho are you?"}]
+ "messages": [{"role": "user", "content": "Hello! How are you?"}]
}'
```
-Embeddings:
+Text Completions:
-```bash
-curl http://{server_ip}:{server_port}/v1/embeddings \
- -H "Content-Type: application/json" \
+```shell
+curl http://{server_ip}:{server_port}/v1/completions \
+ -H 'Content-Type: application/json' \
-d '{
- "model": "internlm-chat-7b",
- "input": "Hello world!"
- }'
+ "model": "llama",
+ "prompt": "two steps to build a house:"
+}'
```
### CLI client
@@ -125,7 +129,7 @@ There is a client script for restful api server.
```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+lmdeploy serve api_client api_server_url
```
### webui
@@ -133,10 +137,10 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
You can also test restful-api through webui.
```shell
-# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
+# api_server_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui
-# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
### FAQ
@@ -146,10 +150,6 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
2. When OOM appeared at the server side, please reduce the number of `instance_num` when lanching the service.
-3. When the request with the same `session_id` to `generate` got a empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and the same for the afterwards.
-
-4. Requests were previously being handled sequentially rather than concurrently. To resolve this issue,
-
- - kindly provide unique session_id values when calling the `generate` API or else your requests may be associated with client IP addresses
+3. When a request with the same `session_id` to `/v1/chat/interactive` returns an empty value and a negative `tokens`, please consider setting `interactive_mode=false` to restart the session (a sketch follows this list).
-5. Both `generate` api and `v1/chat/completions` upport engaging in multiple rounds of conversation, where input `prompt` or `messages` consists of either single strings or entire chat histories.These inputs are interpreted using multi-turn dialogue modes. However, ff you want to turn the mode of and manage the chat history in clients, please the parameter `sequence_end: true` when utilizing the `generate` function, or specify `renew_session: true` when making use of `v1/chat/completions`
+4. The `/v1/chat/interactive` api disables multi-round conversation by default. The input argument `prompt` can be either a single string or an entire chat history.
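+
+A minimal sketch of the session reset mentioned in item 3, using the same request fields as the interactive chat example above:
+
+```bash
+curl http://{server_ip}:{server_port}/v1/chat/interactive \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "Hello! How are you?",
+    "session_id": 1,
+    "interactive_mode": false
+  }'
+```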
diff --git a/docs/en/serving.md b/docs/en/serving.md
index 1e6f783d7a..6cc18018d0 100644
--- a/docs/en/serving.md
+++ b/docs/en/serving.md
@@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf
+lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh
```
@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2
+lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh
```
@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
70B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8
+lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh
```
@@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \
+lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh
```
@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \
+lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh
```
@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
30B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \
+lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh
```
@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
65B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \
+lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh
```
@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b
+lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh
```
@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b
+lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh
```
diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md
index 1b51402056..78f4d2ce5d 100644
--- a/docs/en/supported_models/codellama.md
+++ b/docs/en/supported_models/codellama.md
@@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. Execu
python3 -m pip install lmdeploy
# convert weight layout
-python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model
+lmdeploy convert codellama /the/path/of/codellama/model
```
Then, you can communicate with codellama in consolo by following instructions in next sections
@@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in
### Completion
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### Infilling
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
The input code is supposed to have a special placeholder ``. For example,
@@ -64,7 +64,7 @@ And the generated code piece by `turbomind.chat` is the one to be filled in `
- ignore_eos: bool = False) -> Iterable[List[str]]:
- headers = {'User-Agent': 'Test Client'}
- pload = {
- 'prompt': prompt,
- 'stream': stream,
- 'session_id': session_id,
- 'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
- 'ignore_eos': ignore_eos
- }
- response = requests.post(
- api_url, headers=headers, json=pload, stream=stream)
- for chunk in response.iter_lines(
- chunk_size=8192, decode_unicode=False, delimiter=b'\n'):
- if chunk:
- data = json.loads(chunk.decode('utf-8'))
- output = data['text']
- tokens = data['tokens']
- yield output, tokens
-
-
-for output, tokens in get_streaming_response(
- "Hi, how are you?", "http://{server_ip}:{server_port}/generate", 0,
- 512):
- print(output, end='')
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+model_name = api_client.available_models[0]
+for item in api_client.completions_v1(model=model_name, prompt='hi'):
+ print(item)
+```
+
+LMDeploy 的 `/v1/chat/interactive` api 支持将对话内容管理在服务端,但是我们默认关闭。如果想尝试,请阅读以下介绍:
+
+- 交互模式下,对话历史保存在 server。在一次完整的多轮对话中,所有请求设置`interactive_mode = True`, `session_id`保持相同 (不为 -1,这是缺省值)。
+- 非交互模式下,server 不保存历史记录。
+
+交互模式可以通过 `interactive_mode` 布尔量参数控制。下面是一个普通模式的例子,
+如果要体验交互模式,将 `interactive_mode=True` 传入即可。
+
+```python
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient('http://{server_ip}:{server_port}')
+for item in api_client.generate(prompt='hi'):
+ print(item)
```
### Java/Golang/Rust
@@ -86,16 +86,15 @@ cURL 也可以用于查看 API 的输出结果
curl http://{server_ip}:{server_port}/v1/models
```
-使用 generate:
+Interactive Chat:
```bash
-curl http://{server_ip}:{server_port}/generate \
+curl http://{server_ip}:{server_port}/v1/chat/interactive \
-H "Content-Type: application/json" \
-d '{
"prompt": "Hello! How are you?",
"session_id": 1,
- "sequence_start": true,
- "sequence_end": true
+ "interactive_mode": true
}'
```
@@ -106,19 +105,19 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "internlm-chat-7b",
- "messages": [{"role": "user", "content": "Hello! Ho are you?"}]
+ "messages": [{"role": "user", "content": "Hello! How are you?"}]
}'
```
-Embeddings:
+Text Completions:
-```bash
-curl http://{server_ip}:{server_port}/v1/embeddings \
- -H "Content-Type: application/json" \
+```shell
+curl http://{server_ip}:{server_port}/v1/completions \
+ -H 'Content-Type: application/json' \
-d '{
- "model": "internlm-chat-7b",
- "input": "Hello world!"
- }'
+ "model": "llama",
+ "prompt": "two steps to build a house:"
+}'
```
### CLI client
@@ -126,8 +125,8 @@ curl http://{server_ip}:{server_port}/v1/embeddings \
restful api 服务可以通过客户端测试,例如
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
+lmdeploy serve api_client api_server_url
```
### webui
@@ -135,10 +134,10 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
也可以直接用 webui 测试使用 restful-api。
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-# server_ip 和 server_port 是用来提供 gradio ui 访问服务的
-# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
+# server_name 和 server_port 是用来提供 gradio ui 访问服务的
+# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
### FAQ
@@ -148,12 +147,6 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
2. 当服务端显存 OOM 时,可以适当减小启动服务时的 `instance_num` 个数
-3. 当同一个 `session_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false`
-
-4. 如果感觉请求不是并发地被处理,而是一个一个地处理,请设置好以下参数:
-
- - 不同的 session_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。
+3. 当同一个 `session_id` 的请求给 `/v1/chat/interactive` 函数后,出现返回空字符串和负值的 `tokens`,应该是 `session_id` 混乱了,可以先将交互模式关闭,再重新开启。
-5. `generate` api 和 `v1/chat/completions` 均支持多轮对话。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单词提问,也可以是一段对话历史。
- 两个 api 都是默认开启多伦对话的,如果你想关闭这个功能,然后在客户端管理会话记录,请设置 `sequence_end: true` 传入 `generate`,或者设置
- `renew_session: true` 传入 `v1/chat/completions`。
+4. `/v1/chat/interactive` api 支持多轮对话,但是默认关闭。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单次提问,也可以是一段对话历史。
diff --git a/docs/zh_cn/serving.md b/docs/zh_cn/serving.md
index e0a2f5a986..db4ebb8d3c 100644
--- a/docs/zh_cn/serving.md
+++ b/docs/zh_cn/serving.md
@@ -8,7 +8,7 @@
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf
+lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh
```
@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2
+lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh
```
@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
70B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8
+lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh
```
@@ -42,7 +42,7 @@ bash workspace/service_docker_up.sh
7B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \
+lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh
```
@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
13B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \
+lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh
```
@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
30B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \
+lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh
```
@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
65B
```shell
-python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \
+lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh
```
@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b
+lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh
```
@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1
-python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b
+lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh
```
diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md
index ca9029a527..017df62b5f 100644
--- a/docs/zh_cn/supported_models/codellama.md
+++ b/docs/zh_cn/supported_models/codellama.md
@@ -29,7 +29,7 @@
python3 -m pip install lmdeploy
# 转模型格式
-python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
+lmdeploy convert codellama /path/of/codellama/model
```
接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。
@@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
### 代码续写
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap completion
+lmdeploy chat turbomind ./workspace --cap completion
```
### 代码填空
```shell
-python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling
+lmdeploy chat turbomind ./workspace --cap infilling
```
输入的代码块中要包含 ``,比如:
@@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str:
### 对话
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python"
+lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python"
```
可以把 `--sys-instruct` 的指令换成 codellama 支持的其他变成语言。
@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid
### Python 专项
```
-python3 -m lmdeploy.turbomind.chat ./workspace --cap python
+lmdeploy chat turbomind ./workspace --cap python
```
建议这里部署 Python 微调模型
@@ -90,7 +90,7 @@ TBD
```shell
# --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数
# --tp: 在 tensor parallel时,使用的GPU数量
-python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1
+lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
```
打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。
@@ -98,17 +98,17 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --
你可以用命令行,在控制台与 server 通信:
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
-python -m lmdeploy.serve.openai.api_client restful_api_url
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
+lmdeploy serve api_client api_server_url
```
或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流:
```shell
-# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
+# api_server_url 就是 api_server 产生的,比如 http://localhost:23333
# server_ip 和 server_port 是用来提供 gradio ui 访问服务的
-# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True
-python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True
+# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006
+lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port}
```
关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。
diff --git a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md
index 68cc094df8..e0a220eb60 100644
--- a/docs/zh_cn/w4a16.md
+++ b/docs/zh_cn/w4a16.md
@@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
```shell
## 转换模型的layout,存放在默认路径 ./workspace 下
-python3 -m lmdeploy.serve.turbomind.deploy \
+lmdeploy convert \
--model-name llama2 \
--model-path ./llama2-chat-7b-w4 \
--model-format awq \
--group-size 128
## 推理
-python3 -m lmdeploy.turbomind.chat ./workspace
+lmdeploy chat turbomind ./workspace
```
## 启动 gradio 服务
@@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务
```shell
-python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port}
+lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port}
```
然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话
@@ -82,7 +82,7 @@ python benchmark/profile_generation.py \
### 第一步:生成量化参数
```shell
-python3 -m lmdeploy.lite.apis.calibrate \
+lmdeploy lite calibrate \
--model $HF_MODEL \
--calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval
--calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小
@@ -95,7 +95,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。
```shell
-python3 -m lmdeploy.lite.apis.auto_awq \
+lmdeploy lite auto_awq \
--model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸
diff --git a/lmdeploy/cli/__init__.py b/lmdeploy/cli/__init__.py
new file mode 100644
index 0000000000..3575bec5bd
--- /dev/null
+++ b/lmdeploy/cli/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .cli import run
+
+__all__ = ['run']
diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py
new file mode 100644
index 0000000000..735b24c7cc
--- /dev/null
+++ b/lmdeploy/cli/chat.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+
+class SubCliChat(object):
+ """Chat through terminal with pytorch or turbomind model."""
+
+ def torch(self,
+ model_path: str,
+ tokenizer_path: Optional[str] = None,
+ accel: Optional[str] = None,
+ max_new_tokens: int = 128,
+ temperature: float = 0.8,
+ top_p: float = 0.95,
+ seed: int = 0,
+ use_fast_tokenizer: bool = True,
+ max_alloc: int = 2048,
+ max_session_len: int = None,
+ log_file: Optional[str] = None,
+ debug: bool = False,
+ adapter: Optional[str] = None):
+ """Chat with pytorch model through terminal.
+
+ Args:
+ model_path (str): Path to pytorch model.
+ tokenizer_path (str): Path to tokenizer.
+ accel (str): Model accelerator.
+ max_new_tokens (int): Maximum number of tokens to generate.
+ temperature (float): Temperature for sampling.
+ top_p (float): Top p for sampling.
+ seed (int): Random seed.
+ use_fast_tokenizer (bool): Whether to use fast tokenizer.
+                This argument is passed directly to transformers'
+                ``AutoTokenizer.from_pretrained``.
+                Generally, users should use the fast tokenizer.
+                If the fast tokenizer raises an error, try forcing a slow one.
+ max_alloc (int): Maximum memory to allocate (for deepspeed).
+ max_session_len (int): Maximum number of tokens allowed for all chat sessions.
+                This includes both the history and the current session.
+ log_file (str): Path to log file.
+ debug (bool): Whether to enable debug mode.
+            adapter (str): Force a specific adapter to be used.
+                Generally, users should not set this argument because the adapter
+                is selected automatically based on the model type. It is only
+                required when that is impossible, e.g. llama 1 and llama 2 cannot
+                be distinguished by the `LlamaForCausalLM` class. Currently, only
+                "llama1" is accepted, for llama1 models.
+ """ # noqa: E501
+ from lmdeploy.pytorch.chat import main as run_torch_model
+
+ run_torch_model(model_path,
+ tokenizer_path=tokenizer_path,
+ accel=accel,
+ max_new_tokens=max_new_tokens,
+ temperature=temperature,
+ top_p=top_p,
+ seed=seed,
+ use_fast_tokenizer=use_fast_tokenizer,
+ max_alloc=max_alloc,
+ max_session_len=max_session_len,
+ log_file=log_file,
+ debug=debug,
+ adapter=adapter)
+
+ def turbomind(self,
+ model_path,
+ session_id: int = 1,
+ cap: str = 'chat',
+ tp=1,
+ stream_output=True,
+ **kwargs):
+ """Chat with turbomind model through terminal.
+
+ Args:
+ model_path (str): the path of the deployed model
+ session_id (int): the identical id of a session
+            cap (str): the capability of a model. For example, codellama
+                supports the capabilities ['completion', 'infilling',
+                'chat', 'python']
+ tp (int): GPU number used in tensor parallelism
+ stream_output (bool): indicator for streaming output or not
+            **kwargs (dict): other arguments for initializing model's chat
+ template
+ """
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ run_turbomind_model(model_path,
+ session_id=session_id,
+ cap=cap,
+ tp=tp,
+ stream_output=stream_output,
+ **kwargs)
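For orientation, the fire-based entry point in `cli.py` below maps shell flags onto these method arguments one to one. A rough, illustrative equivalence:

```python
# Illustration only: `lmdeploy chat turbomind ./workspace --cap python --tp 2`
# ends up calling the method above roughly like this; the workspace path is a placeholder.
from lmdeploy.cli.chat import SubCliChat

SubCliChat().turbomind('./workspace', cap='python', tp=2)  # starts an interactive terminal session
```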
diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
new file mode 100644
index 0000000000..ab15cb46ad
--- /dev/null
+++ b/lmdeploy/cli/cli.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+import fire
+
+from .chat import SubCliChat
+from .lite import SubCliLite
+from .serve import SubCliServe
+
+
+class CLI(object):
+ """LMDeploy Command Line Interface.
+
+ The CLI provides a unified API for converting, compressing and deploying
+ large language models.
+ """
+
+ def convert(self,
+ model_name: str,
+ model_path: str,
+ model_format: str = None,
+ tokenizer_path: str = None,
+ dst_path: str = './workspace',
+ tp: int = 1,
+ quant_path: str = None,
+ group_size: int = 0):
+ """Convert LLMs to lmdeploy format.
+
+ Args:
+ model_name (str): The name of the to-be-deployed model, such as
+                llama-7b, llama-13b, vicuna-7b, etc.
+ model_path (str): The directory path of the model
+            model_format (str): the format of the model, chosen from
+                ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
+                format, 'hf' means huggingface llama format, and 'awq' means a
+                llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
+                The default value is None, which means the model_format will be
+                inferred from model_name
+ tokenizer_path (str): The path of tokenizer model.
+ dst_path (str): The destination path that saves outputs.
+ tp (int): The number of GPUs used for tensor parallelism, which
+ should be 2^n.
+ quant_path (str): Path of the quantized model, which can be None.
+ group_size (int): A parameter used in AWQ to quantize fp16 weights
+ to 4 bits.
+ """
+ from lmdeploy.turbomind.deploy.converter import main as convert
+
+ convert(model_name,
+ model_path,
+ model_format=model_format,
+ tokenizer_path=tokenizer_path,
+ dst_path=dst_path,
+ tp=tp,
+ quant_path=quant_path,
+ group_size=group_size)
+
+ def list(self, engine: str = 'turbomind'):
+ """List supported model names.
+
+        Example 1:
+ lmdeploy list
+
+        Example 2:
+ lmdeploy list --engine pytorch
+
+ Args:
+ engine (str): The backend for the model to run. Choice from
+ ['turbomind', 'pytorch'].
+ """
+ assert engine in ['turbomind', 'pytorch']
+ if engine == 'pytorch':
+ model_names = ['llama', 'llama2', 'internlm-7b']
+ elif engine == 'turbomind':
+ from lmdeploy.model import MODELS
+ model_names = list(MODELS.module_dict.keys())
+ model_names = [n for n in model_names if n.lower() not in ['base']]
+ model_names.sort()
+ print('Supported model names:')
+ print('\n'.join(model_names))
+
+ def check_env(self, dump_file: str = None):
+ """Check env information.
+
+ Args:
+ dump_file (str): Output file to save env info.
+ """
+
+ import importlib
+
+ import mmengine
+ from mmengine.utils import get_git_hash
+ from mmengine.utils.dl_utils import collect_env
+
+ from lmdeploy.version import __version__
+
+ env_info = collect_env()
+ env_info['LMDeploy'] = __version__ + '+' + get_git_hash()[:7]
+
+ # remove some unnecessary info
+ remove_reqs = ['MMEngine', 'OpenCV']
+ for req in remove_reqs:
+ if req in env_info:
+ env_info.pop(req)
+
+ # extra important dependencies
+ extra_reqs = ['transformers', 'gradio', 'fastapi', 'pydantic']
+
+ for req in extra_reqs:
+ try:
+ env_info[req] = importlib.import_module(req).__version__
+ except Exception:
+ env_info[req] = 'Not Found'
+
+ # print env info
+ for k, v in env_info.items():
+ print(f'{k}: {v}')
+
+ # dump to local file
+ if dump_file is not None:
+ work_dir, _ = os.path.split(dump_file)
+ if work_dir:
+ os.makedirs(work_dir, exist_ok=True)
+ mmengine.dump(env_info, dump_file)
+
+
+def run():
+ """The entry point of running LMDeploy CLI."""
+
+ cli = CLI()
+ cli.lite = SubCliLite()
+ cli.chat = SubCliChat()
+ cli.serve = SubCliServe()
+
+ fire.Fire(cli, name='lmdeploy')
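Since `fire.Fire(cli, name='lmdeploy')` exposes exactly the methods above as sub-commands, the same class can be used programmatically. A small sketch; the dump file name is arbitrary.

```python
# Sketch: programmatic use of the CLI object; output goes to stdout as on the command line.
from lmdeploy.cli.cli import CLI

cli = CLI()
cli.list(engine='turbomind')        # same listing as `lmdeploy list`
cli.check_env(dump_file='env.yml')  # same report as `lmdeploy check_env`, plus a YAML dump
```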
diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py
new file mode 100644
index 0000000000..4302765e28
--- /dev/null
+++ b/lmdeploy/cli/lite.py
@@ -0,0 +1,100 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+
+class SubCliLite(object):
+ """CLI for compressing LLMs."""
+
+ def auto_awq(self,
+ model: str,
+ work_dir: str,
+ w_bits: int = 4,
+ w_sym: bool = False,
+ w_group_size: int = 128,
+ device: str = 'cuda'):
+ """Perform weight quantization using AWQ algorithm.
+
+ Args:
+ model (str): The path of model in hf format.
+ work_dir (str): The working directory to save results.
+ w_bits (int): Bit number for weight quantization.
+ w_sym (bool): Whether to do symmetric quantization.
+ w_group_size (int): Group size for weight quantization statistics.
+            device (str): The device to run on.
+ """
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+
+ auto_awq(model,
+ work_dir,
+ w_bits=w_bits,
+ w_sym=w_sym,
+ w_group_size=w_group_size,
+ device=device)
+
+ def calibrate(self,
+ model: str,
+ calib_dataset: str = 'c4',
+ calib_samples: int = 128,
+ calib_seqlen: int = 2048,
+ work_dir: str = './work_dir',
+ device: str = 'cuda') -> None:
+ """Perform calibration on a given dataset.
+
+ Args:
+ model (str): The model to be loaded.
+ calib_dataset (str, optional): The calibration dataset name.
+ Defaults to 'c4'.
+ calib_samples (int, optional): The number of samples for
+ calibration. Defaults to 128.
+ calib_seqlen (int, optional): The sequence length for calibration.
+ Defaults to 2048.
+ work_dir (str): The working directory for outputs.
+ Defaults to './work_dir'.
+ device (str, optional): The device to be used for calculation.
+ Defaults to 'cuda'.
+ """
+ from lmdeploy.lite.apis.calibrate import calibrate
+
+ calibrate(model,
+ calib_dataset=calib_dataset,
+ calib_samples=calib_samples,
+ calib_seqlen=calib_seqlen,
+ work_dir=work_dir,
+ device=device)
+
+ def kv_qparams(self,
+ work_dir: str,
+ turbomind_dir: str,
+ kv_bits: int = 8,
+ kv_sym: bool = False,
+ num_tp: int = 1) -> None:
+ """Export key and value stats.
+
+ Args:
+ work_dir (str): Directory path where the stats
+ are saved.
+            turbomind_dir (str): Directory path where the
+                results will be saved.
+ kv_bits (int, optional): Number of bits for quantization.
+ Defaults to 8.
+ kv_sym (bool, optional): Whether to use symmetric quantization.
+ Defaults to False.
+            num_tp (int, optional): The number of GPUs used for tensor
+                parallelism. Defaults to 1.
+ """
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ run_kv_qparams(work_dir,
+ turbomind_dir,
+ kv_bits=kv_bits,
+ kv_sym=kv_sym,
+ num_tp=num_tp)
+
+ def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
+ """Convert a hugging face model to the smallest sharded one.
+
+ Args:
+ src_dir (str): The directory of the input HF model.
+ dst_dir (str): The directory to save new model.
+ """
+ from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
+ run_sharded(src_dir, dst_dir)
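Completing the quantization flow, the KV-cache step wrapped by `kv_qparams` above can also be invoked from Python. A sketch, assuming a turbomind workspace produced by `lmdeploy convert`; the directory names are placeholders.

```python
# Sketch: export int8 KV-cache quantization parameters into a turbomind workspace.
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams

run_kv_qparams('./quant_output',                     # stats written by `lmdeploy lite calibrate`
               './workspace/triton_models/weights',  # hypothetical turbomind weight directory
               kv_bits=8,
               kv_sym=False,
               num_tp=1)
```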
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
new file mode 100644
index 0000000000..33580cdfe1
--- /dev/null
+++ b/lmdeploy/cli/serve.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+
+class SubCliServe(object):
+ """Serve LLMs and interact on terminal or web UI."""
+
+ def gradio(self,
+ model_path_or_server: str,
+ server_name: str = '0.0.0.0',
+ server_port: int = 6006,
+ batch_size: int = 32,
+ tp: int = 1,
+ restful_api: bool = False):
+ """Serve LLMs with web ui using gradio.
+
+ Example 1:
+ lmdeploy serve gradio ./workspace
+
+ Example 2:
+ lmdeploy serve gradio http://0.0.0.0:23333
+ --server_name 0.0.0.0
+ --server_port 6006
+ --restful_api True
+
+ Example 3:
+            lmdeploy serve gradio ${triton_server_ip_address}:33337
+
+ Args:
+            model_path_or_server (str): the path of the deployed model or the
+                tritonserver URL or restful api URL. The former runs the
+                service with gradio directly. The latter runs against
+                tritonserver by default; if the URL points to a restful api,
+                please also enable the `restful_api` flag.
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ batch_size (int): batch size for running Turbomind directly
+ tp (int): tensor parallel for Turbomind
+ restful_api (bool): a flag for model_path_or_server
+ """
+ from lmdeploy.serve.gradio.app import run
+ run(model_path_or_server,
+ server_name=server_name,
+ server_port=server_port,
+ batch_size=batch_size,
+ tp=tp,
+ restful_api=restful_api)
+
+ def api_server(self,
+ model_path: str,
+ server_name: str = '0.0.0.0',
+ server_port: int = 23333,
+ instance_num: int = 32,
+ tp: int = 1,
+ allow_origins: List[str] = ['*'],
+ allow_credentials: bool = True,
+ allow_methods: List[str] = ['*'],
+ allow_headers: List[str] = ['*']):
+ """Serve LLMs with restful api using fastapi.
+
+ Args:
+ model_path (str): the path of the deployed model
+ server_name (str): host ip for serving
+ server_port (int): server port
+ instance_num (int): number of instances of turbomind model
+ tp (int): tensor parallel
+ allow_origins (List[str]): a list of allowed origins for CORS
+ allow_credentials (bool): whether to allow credentials for CORS
+ allow_methods (List[str]): a list of allowed HTTP methods for CORS
+ allow_headers (List[str]): a list of allowed HTTP headers for CORS
+ """
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ run_api_server(model_path,
+ server_name=server_name,
+ server_port=server_port,
+ instance_num=instance_num,
+ tp=tp,
+ allow_origins=allow_origins,
+ allow_credentials=allow_credentials,
+ allow_methods=allow_methods,
+ allow_headers=allow_headers)
+
+ def api_client(self, restful_api_url: str, session_id: int = 0):
+ """Interact with restful api server in terminal.
+
+ Args:
+ restful_api_url: The restful api URL.
+ session_id: The identical id of a session.
+ """
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ run_api_client(restful_api_url, session_id=session_id)
+
+ def triton_client(self,
+ tritonserver_addr: str,
+ session_id: int = 1,
+ cap: str = 'chat',
+ stream_output: bool = True,
+ **kwargs):
+ """Interact with Triton Server using gRPC protocol.
+
+ Args:
+ tritonserver_addr (str): the address in format "ip:port" of
+ triton inference server
+ session_id (int): the identical id of a session
+ cap (str): the capability of a model. For example, codellama
+ has the ability among ['completion', 'infill', 'instruct',
+ 'python']
+ stream_output (bool): indicator for streaming output or not
+ **kwargs (dict): other arguments for initializing model's
+ chat template
+ """
+
+ from lmdeploy.serve.client import main as run_triton_client
+
+ run_triton_client(
+ tritonserver_addr,
+ session_id=session_id,
+ cap=cap,
+ stream_output=stream_output,
+ **kwargs,
+ )
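The serve wrappers are thin. Below is a sketch of launching the RESTful server directly from Python, using the same underlying entry point that `api_server()` above imports; the workspace path is a placeholder.

```python
# Sketch: start the OpenAI-style RESTful server programmatically; blocks until interrupted.
from lmdeploy.serve.openai.api_server import main as run_api_server

run_api_server('./workspace',
               server_name='0.0.0.0',
               server_port=23333,
               instance_num=32,
               tp=1)
```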
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
index 3517f51b85..38f067b563 100644
--- a/lmdeploy/lite/apis/auto_awq.py
+++ b/lmdeploy/lite/apis/auto_awq.py
@@ -2,7 +2,6 @@
from pathlib import Path
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
@@ -16,13 +15,15 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
- 'BaiChuanForCausalLM': 'DecoderLayer',
+ 'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
+ 'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
- 'BaiChuanForCausalLM': 'RMSNorm',
+ 'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
+ 'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}
@@ -41,6 +42,9 @@ def auto_awq(model: str,
hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
checkpoint = hf_config._name_or_path
+ # hard code for qwen, other configs do not have the `fp16` attribute.
+ hf_config.fp16 = True
+
with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
@@ -62,11 +66,14 @@ def auto_awq(model: str,
device_map[name] = 'cpu'
else:
device_map[name] = 0
- load_checkpoint_in_model(model, checkpoint, device_map)
+ load_checkpoint_in_model(model,
+ checkpoint,
+ device_map,
+ dtype=torch.float16)
work_dir = Path(work_dir)
- act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmean']
+ act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
@@ -81,5 +88,6 @@ def auto_awq(model: str,
if __name__ == '__main__':
+ import fire
fire.Fire(auto_awq)
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 38b6429a19..27d631bdad 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -1,11 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
from pathlib import Path
+from typing import Union
-import fire
import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
+from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from lmdeploy.lite.quantization import CalibrationContext
@@ -14,17 +15,90 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
- 'BaiChuanForCausalLM': 'DecoderLayer',
+ 'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
+ 'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
- 'BaiChuanForCausalLM': 'RMSNorm',
+ 'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
+ 'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}
+def _prepare_for_calibrate(model: nn.Module,
+ layer_type: Union[str, type],
+ head_name: str = 'lm_head',
+ device: str = 'cuda',
+ prefix: str = '') -> None:
+ """Prepare the model for calibration by moving specific modules to CPU.
+
+ This function goes through each child of a given model and checks whether
+ it is an instance of a certain layer type or has the name equal to
+ `head_name`.
+ If yes, it moves the module to CPU, otherwise to the specified device
+ (default is CUDA).
+
+ If the child contains the target layer type in its sub-modules, the
+ function performs the same operation recursively.
+
+ Parameters
+ ----------
+ model : nn.Module
+ The PyTorch model to prepare for calibration.
+ layer_type : Union[str, Type]
+ The type of the layer to be moved to CPU. Can be either a string of
+ class name or the class type itself.
+ head_name : str, optional
+ The name of the module to be moved to CPU. Default is 'lm_head'.
+ device : str, optional
+ The device to which modules not matching the `layer_type` or
+ `head_name` will be moved. Default is 'cuda'.
+ prefix : str, optional
+ The prefix used when printing the names of the moved modules.
+ Default is ''.
+
+ Raises
+ ------
+ TypeError
+ If `layer_type` is neither a string nor a type.
+ """
+
+ for name, child in model.named_children():
+
+ # Check if the child is an instance of the given layer type
+ if isinstance(layer_type, str):
+ is_layer = type(child).__name__ == layer_type
+ elif isinstance(layer_type, type):
+ is_layer = isinstance(child, layer_type)
+ else:
+ raise TypeError(
+ 'layer_type should be a string (class name) or a type')
+
+ # Check if the child contains the target module type
+ contain_layer = len(
+ collect_target_modules(child, layer_type, [head_name]).keys()) > 0
+
+ # Check if the child matches the head name
+ is_head = name == head_name
+
+ mod_name = f'{prefix}.{name}' if prefix else name
+
+ # If the child is either an instance of the layer type or has the
+ # head name, move it to CPU, otherwise move it to the specified device
+ if is_layer or is_head:
+ child.to('cpu')
+ print(f'Move {mod_name} to CPU.')
+ elif contain_layer:
+ _prepare_for_calibrate(child, layer_type, head_name, device,
+ mod_name)
+ else:
+ child.to(device)
+            print(f'Move {mod_name} to {device}.')
+
+
def calibrate(model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
@@ -55,16 +129,38 @@ def calibrate(model: str,
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
trust_remote_code=True)
- hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+ hf_config = AutoConfig.from_pretrained(model,
+ torch_dtype=torch.float16,
+ trust_remote_code=True)
checkpoint = hf_config._name_or_path
+ # hard code for qwen, other configs do not have the `fp16` attribute.
+ hf_config.fp16 = True
+
with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
+ config=hf_config,
torch_dtype=torch.float16,
trust_remote_code=True)
model.config.use_cache = False
+ model_type = type(model).__name__
+ if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
+ raise RuntimeError(
+            f'Currently, quantization and calibration of {model_type} are '
+ f'not supported. The supported model types are '
+ f"{', '.join(LAYER_TYPE_MAP.keys())}.")
+
+ if model_type == 'QWenLMHeadModel':
+ try:
+ import flash_attn # noqa: F401
+ except ImportError:
+ raise RuntimeError(
+ 'When using Qwen, you need to `pip install flash-attn` first, '
+                'otherwise calibration and quantization will not work '
+ 'properly.')
+
layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]
@@ -78,7 +174,12 @@ def calibrate(model: str,
device_map[name] = 'cpu'
else:
device_map[name] = 0
- load_checkpoint_in_model(model, checkpoint, device_map)
+ load_checkpoint_in_model(model,
+ checkpoint,
+ device_map,
+ dtype=torch.float16)
+
+ _prepare_for_calibrate(model, layer_type, 'lm_head', device)
print('Loading calibrate dataset ...')
calib_loader, _ = get_calib_loaders(calib_dataset,
@@ -107,4 +208,6 @@ def calibrate(model: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(calibrate)
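To make the device-placement logic of `_prepare_for_calibrate` above concrete, here is a toy sketch with made-up module classes. It keeps everything on CPU so it runs without a GPU and only demonstrates which children are moved where.

```python
# Toy sketch: decoder layers and lm_head stay on CPU, the rest goes to `device`.
import torch.nn as nn
from lmdeploy.lite.apis.calibrate import _prepare_for_calibrate

class ToyDecoderLayer(nn.Module):  # hypothetical stand-in for a real decoder layer
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 8)

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(100, 8)
        self.layers = nn.ModuleList([ToyDecoderLayer() for _ in range(2)])
        self.lm_head = nn.Linear(8, 100)

model = ToyModel()
# device='cpu' keeps the sketch runnable on machines without CUDA
_prepare_for_calibrate(model, 'ToyDecoderLayer', head_name='lm_head', device='cpu')
```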
diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py
index 7d43078daf..f31fee0299 100644
--- a/lmdeploy/lite/apis/kv_qparams.py
+++ b/lmdeploy/lite/apis/kv_qparams.py
@@ -2,7 +2,6 @@
from pathlib import Path
from typing import Union
-import fire
import numpy as np
import torch
@@ -120,5 +119,6 @@ def main(work_dir: str,
if __name__ == '__main__':
+ import fire
fire.Fire(main)
diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
index c9811563fd..4dca8b1469 100644
--- a/lmdeploy/lite/quantization/awq.py
+++ b/lmdeploy/lite/quantization/awq.py
@@ -18,6 +18,10 @@
'QWenBlock': {
'ln_1': ['attn.c_attn'],
'ln_2': ['mlp.w1', 'mlp.w2']
+ },
+ 'DecoderLayer': {
+ 'input_layernorm': ['self_attn.W_pack'],
+ 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
}
}
@@ -33,6 +37,10 @@
'QWenBlock': {
'attn.c_attn': ['attn.c_proj'],
'mlp.w1': ['mlp.c_proj']
+ },
+ 'DecoderLayer': {
+ 'self_attn.W_pack': ['self_attn.o_proj'],
+ 'mlp.up_proj': ['mlp.down_proj']
}
}
@@ -69,7 +77,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)
scales = (act_scales.pow(alpha) /
- w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
+ w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
ln.weight.div_(scales)
@@ -116,10 +124,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)
scales = (act_scales.pow(alpha) /
- w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
+ w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
- # (for qwen) pre_fc is packed QKV, only V needs to scale
+ # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
if size_pre_fc > size_a and size_pre_fc % size_a == 0 \
and size_pre_fc // size_a == 3:
diff --git a/lmdeploy/lite/quantization/weight/quantizer.py b/lmdeploy/lite/quantization/weight/quantizer.py
index 56cfda8f01..1d01696eb9 100644
--- a/lmdeploy/lite/quantization/weight/quantizer.py
+++ b/lmdeploy/lite/quantization/weight/quantizer.py
@@ -8,7 +8,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
- cal_qparams_per_tensor_minmax)
+ cal_qparams_per_tensor_minmax, precise_round)
from lmdeploy.lite.utils.global_avail import GlobalAvailMixin
@@ -119,8 +119,10 @@ def quant(self,
torch.Tensor: The fake quantized weight tensor.
"""
+ float_w = weight.float()
+
if qparams is None:
- qparams = self.calculate_qparams(weight)
+ qparams = self.calculate_qparams(float_w)
scales = qparams.scales
zero_points = qparams.zero_points
@@ -133,17 +135,18 @@ def quant(self,
# per group scales shape: [out_c, in_c//group_size, 1]
if len(scales.shape) > 2:
# scales shape: [out_c, in_c//group_size, 1]
- weight = weight.reshape(out_c, scales.shape[1], -1)
+ float_w = float_w.reshape(out_c, scales.shape[1], -1)
if zero_points is None:
assert self.symmetry
- real_qweight = (weight / scales).round()
+ real_qweight = (float_w / scales).round()
fake_qweight = real_qweight * scales
else:
assert not self.symmetry
- real_qweight = (weight / scales).round() + zero_points
+ real_qweight = precise_round(
+ (float_w - float_w.min(-1, keepdim=True)[0]) / scales)
fake_qweight = (real_qweight - zero_points) * scales
if len(scales.shape) > 2:
@@ -153,4 +156,4 @@ def quant(self,
if real:
return real_qweight.to(torch.int32)
else:
- return fake_qweight
+ return fake_qweight.to(weight.dtype)
diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py
index c2b56287bd..2561fdb23f 100644
--- a/lmdeploy/lite/utils/__init__.py
+++ b/lmdeploy/lite/utils/__init__.py
@@ -6,7 +6,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
- cal_qparams_per_tensor_minmax)
+ cal_qparams_per_tensor_minmax, precise_round)
from .calib_dataloader import get_calib_loaders
from .collect import (bimap_name_mod, collect_target_modules,
collect_target_weights)
@@ -16,7 +16,7 @@
'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax',
'cal_qparams_per_group_absmax', 'cal_qparams_per_group_minmax',
'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax',
- 'QParams', 'get_calib_loaders', 'collect_target_modules',
+ 'QParams', 'get_calib_loaders', 'collect_target_modules', 'precise_round',
'collect_target_weights', 'GlobalAvailMixin', 'split_decoder_layer_inputs',
'bimap_name_mod', 'concat_decoder_layer_outputs'
]
diff --git a/lmdeploy/lite/utils/cal_qparams.py b/lmdeploy/lite/utils/cal_qparams.py
index a682704a55..569297cdb5 100644
--- a/lmdeploy/lite/utils/cal_qparams.py
+++ b/lmdeploy/lite/utils/cal_qparams.py
@@ -11,16 +11,22 @@ class QParams(NamedTuple):
zero_points: Optional[torch.Tensor]
+@torch.no_grad()
+def precise_round(x):
+    """Round halves away from zero (torch.round rounds ties to even)."""
+    return x.sign() * (x.abs() + 0.5).floor()
+
+
@torch.no_grad()
def cal_qparams_per_channel_absmax(w: torch.Tensor,
n_bits: int,
return_stats: bool = False) -> QParams:
"""Calculate quantization parameters for each channel using absolute max
value."""
+ float_w = w.float()
- absmax = w.abs().max(dim=-1, keepdim=True)[0]
+ absmax = float_w.abs().max(dim=-1, keepdim=True)[0]
q_max = 2**(n_bits - 1) - 1
- scales = absmax.clamp(min=1e-5).div(q_max)
+ scales = absmax.div(q_max)
if return_stats:
return QParams(scales=scales, zero_points=None), absmax
@@ -35,14 +41,16 @@ def cal_qparams_per_channel_minmax(w: torch.Tensor,
"""Calculate quantization parameters for each channel using min and max
values."""
- w_min = w.min(dim=-1, keepdim=True)[0]
- w_max = w.max(dim=-1, keepdim=True)[0]
+ float_w = w.float()
+
+ w_min = float_w.min(dim=-1, keepdim=True)[0]
+ w_max = float_w.max(dim=-1, keepdim=True)[0]
q_max = 2**n_bits - 1
scales = (w_max - w_min)
- scales = scales.clamp_(min=1e-5).div_(q_max)
+ scales = scales.div_(q_max)
- zero_points = (-w_min / scales).round()
+ zero_points = precise_round(-w_min / scales)
if return_stats:
return QParams(scales=scales, zero_points=zero_points), (w_min, w_max)
@@ -63,9 +71,12 @@ def cal_qparams_per_group_absmax(w: torch.Tensor,
'Input channels should be greater than or equal to group_size.'
assert inc % group_size == 0, \
'Input channels should be divisible by group_size.'
- absmax = w.abs().reshape(outc, -1, group_size).max(dim=-1, keepdim=True)[0]
+
+ float_w = w.float()
+ absmax = float_w.abs().reshape(outc, -1, group_size).max(dim=-1,
+ keepdim=True)[0]
q_max = 2**(n_bits - 1) - 1
- scales = absmax.clamp(min=1e-5).div(q_max)
+ scales = absmax.div(q_max)
if return_stats:
return QParams(scales=scales, zero_points=None), absmax
else:
@@ -85,14 +96,16 @@ def cal_qparams_per_group_minmax(w: torch.Tensor,
'Input channels should be greater than or equal to group_size.'
assert inc % group_size == 0, \
'Input channels should be divisible by group_size.'
- w_group_wise = w.reshape(outc, -1, group_size)
+
+ float_w = w.float()
+ w_group_wise = float_w.reshape(outc, -1, group_size)
w_min = w_group_wise.min(dim=-1, keepdim=True)[0]
w_max = w_group_wise.max(dim=-1, keepdim=True)[0]
q_max = 2**n_bits - 1
scales = (w_max - w_min)
- scales = scales.clamp_(min=1e-5).div_(q_max)
- zero_points = (-w_min / scales).round()
+ scales = scales.div_(q_max)
+ zero_points = precise_round(-w_min / scales)
if return_stats:
return QParams(scales=scales, zero_points=zero_points), (w_min, w_max)
else:
@@ -106,13 +119,15 @@ def cal_qparams_per_tensor_minmax(w: torch.Tensor,
"""Calculate quantization parameters for the entire tensor using min and
max values."""
- w_min = w.min()
- w_max = w.max()
+ float_w = w.float()
+
+ w_min = float_w.min()
+ w_max = float_w.max()
q_max = 2**n_bits - 1
scales = (w_max - w_min)
scales = scales.clamp_(min=1e-5).div_(q_max)
- zero_points = (-w_min / scales).round()
+ zero_points = precise_round(-w_min / scales)
if return_stats:
return QParams(scales=scales, zero_points=zero_points), (w_min, w_max)
else:
@@ -125,9 +140,10 @@ def cal_qparams_per_tensor_absmax(w: torch.Tensor,
return_stats: bool = False) -> QParams:
"""Calculate quantization parameters for the entire tensor using absolute
max value."""
- absmax = w.abs().max()
+ float_w = w.float()
+ absmax = float_w.abs().max()
q_max = 2**(n_bits - 1) - 1
- scales = absmax.clamp(min=1e-5).div(q_max)
+ scales = absmax.div(q_max)
if return_stats:
return QParams(scales=scales, zero_points=None), absmax
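The new `precise_round` helper differs from `torch.round` only in how it treats ties, which matters for the zero-point and weight rounding above. A tiny sketch of the difference:

```python
# Sketch: torch.round uses round-half-to-even, precise_round rounds halves away from zero.
import torch
from lmdeploy.lite.utils import precise_round

x = torch.tensor([0.5, 1.5, 2.5, -0.5, -1.5])
print(torch.round(x))    # -> 0., 2., 2., -0., -2.  (ties go to the even neighbour)
print(precise_round(x))  # -> 1., 2., 3., -1., -2.  (ties move away from zero)
```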
diff --git a/lmdeploy/lite/utils/collect.py b/lmdeploy/lite/utils/collect.py
index 8b2691a4a6..3b66ef6146 100644
--- a/lmdeploy/lite/utils/collect.py
+++ b/lmdeploy/lite/utils/collect.py
@@ -1,7 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Tuple, Union
-from mmengine.config.lazy import LazyAttr
from torch import nn
@@ -22,9 +21,6 @@ def collect_target_modules(model: nn.Module,
A dictionary mapping from module names to module instances.
"""
- if isinstance(target, LazyAttr):
- target = target.build()
-
if not isinstance(target, (type, str)):
raise TypeError('Target must be a string (name of the module) '
'or a type (class of the module)')
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index b3fc86f999..81b8229f6a 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -115,6 +115,7 @@ def update_input_ids(self, input_ids: List[int]):
return input_ids
+@MODELS.register_module(name='wizardlM')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseModel):
"""Chat template of vicuna model."""
@@ -177,15 +178,16 @@ class InternLMChat7B(BaseModel):
def __init__(
self,
- system='<|System|>',
+ system='<|System|>:',
meta_instruction="""You are an AI assistant whose name is InternLM (书生·浦语).
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
""", # noqa: E501
- user='<|User|>',
- eoh='',
- eoa='',
- assistant='<|Bot|>',
+ user='<|User|>:',
+ eoh='\n',
+ eoa='\n',
+ eosys='\n',
+ assistant='<|Bot|>:',
stop_words=[''],
**kwargs):
super().__init__(**kwargs)
@@ -194,6 +196,7 @@ def __init__(
self.user = user
self.eoh = eoh
self.eoa = eoa
+ self.eosys = eosys
self.assistant = assistant
self.stop_words = stop_words
@@ -211,12 +214,12 @@ def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
- return f'{self.system}:{self.meta_instruction}\n' \
- f'{self.user}:{prompt}{self.eoh}\n' \
- f'{self.assistant}:'
+ return f'{self.system}{self.meta_instruction}{self.eosys}' \
+ f'{self.user}{prompt}{self.eoh}' \
+ f'{self.assistant}'
else:
- return f'\n{self.user}:{prompt}{self.eoh}\n' \
- f'{self.assistant}:'
+ return f'\n{self.user}{prompt}{self.eoh}' \
+ f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
@@ -227,17 +230,19 @@ def messages2prompt(self, messages, sequence_start=True):
Returns:
str: the concatenated prompt
"""
+
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
- system, users, assistants = self._translate_messages(messages)
- system = self.meta_instruction if not system else system
- ret = f'{self.system}:{system}\n'
- for user, assistant in zip(users, assistants):
- if assistant:
- ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
- f'{assistant}{self.eoa}\n'
- else:
- ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
+ eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
+ ret = ''
+ if self.meta_instruction:
+ ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
+
+ for message in messages:
+ role = message['role']
+ content = message['content']
+ ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
+ ret += f'{self.assistant}:'
return ret
@@ -386,15 +391,16 @@ def messages2prompt(self, messages, sequence_start=True):
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
- system, users, assistants = self._translate_messages(messages)
- system = self.system if not system else system
- ret = f'{system}{self.meta_instruction}{self.eosys}'
- for user, assistant in zip(users, assistants):
- if assistant:
- ret += f'{self.user}{user}{self.eoh}{self.assistant}' \
- f'{assistant}{self.eoa}'
- else:
- ret += f'{self.user}{user}{self.eoh}{self.assistant}'
+ eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
+ ret = ''
+ if self.meta_instruction:
+ ret += f'{self.system}{self.meta_instruction}{self.eosys}'
+
+ for message in messages:
+ role = message['role']
+ content = message['content']
+ ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
+ ret += f'{self.assistant}'
return ret
@@ -625,6 +631,141 @@ def update_input_ids(self, input_ids: List):
return input_ids
+@MODELS.register_module(name='solar')
+class SOLAR(BaseModel):
+ """Chat template of SOLAR model.
+
+ `https://huggingface.co/upstage/SOLAR-0-70b-16bit`
+ """
+
+ def __init__(self,
+ b_sys='### System:\n',
+ e_sys='\n\n',
+ user='### User:\n',
+ eoh='\n\n',
+ assistant='### Assistant:\n',
+ eoa='\n\n',
+ system='',
+ session_len=2048,
+ **kwargs):
+ super().__init__(**kwargs)
+ self.b_sys = b_sys
+ self.e_sys = e_sys
+ self.user = user
+ self.eoh = eoh
+ self.assistant = assistant
+ self.eoa = eoa
+ self.system = system
+ self.session_len = session_len
+
+ def decorate_prompt(self, prompt, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template.
+
+ Args:
+ prompt (str): user's input prompt
+ sequence_start (bool): indicator for the first round chat of a
+ session sequence
+ Returns:
+ str: the concatenated prompt
+ """
+ assert self.capability == 'chat', \
+ f'{type(self).__name__} has no capability of {self.capability}'
+ if sequence_start:
+ return f'{self.b_sys}{self.system}{self.e_sys}' \
+ f'{self.user}{prompt}{self.eoh}{self.assistant}'
+
+ return f'{self.user}{prompt}{self.eoh}{self.assistant}'
+
+ def messages2prompt(self, messages, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template.
+
+ Args:
+ messages (str | List): user's input prompt
+ Returns:
+ str: the concatenated prompt
+ """
+ if isinstance(messages, str):
+ return self.get_prompt(messages, sequence_start)
+ system, users, assistants = self._translate_messages(messages)
+ system = self.system if not system else system
+ ret = f'{self.b_sys}{system}{self.e_sys}'
+ for i, (user, assistant) in enumerate(zip(users, assistants)):
+ ret += f'{self.user}{user}{self.eoh}{self.assistant}'
+ if assistant:
+ ret += f'{assistant}{self.eoa}'
+ return ret
+
+
+@MODELS.register_module(name='ultracm')
+@MODELS.register_module(name='ultralm')
+class UltraChat(BaseModel):
+ """Template of UltraCM and UltraLM models.
+
+ `https://huggingface.co/openbmb/UltraCM-13b`
+ `https://huggingface.co/openbmb/UltraLM-13b`
+ """
+
+ def __init__(
+ self,
+ system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501
+ eos='',
+ user='User: ',
+ assistant='Assistant: ',
+ session_len=2048,
+ **kwargs):
+ super().__init__(**kwargs)
+ self.system = system
+ self.eos = eos
+ self.session_len = session_len
+ self.user = user
+ self.assistant = assistant
+
+ def decorate_prompt(self, prompt, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template.
+
+ Args:
+ prompt (str): the input prompt
+ sequence_start (bool): indicator for the first round chat of a
+ session sequence
+ Returns:
+ str: the concatenated prompt
+ """
+ assert self.capability == 'chat', \
+ f'{type(self).__name__} has no capability of {self.capability}'
+ if sequence_start:
+ return f'{self.system}\n{self.user}{prompt}{self.eos}' \
+ f'\n{self.assistant}'
+
+ return f'\n{self.user}{prompt}{self.eos}' \
+ f'\n{self.assistant}'
+
+ def messages2prompt(self, messages, sequence_start=True):
+ """Return the prompt that is concatenated with other elements in the
+ chat template. Only evaluate the last instruction completion pair.
+
+ Args:
+ messages (str | List): user's input prompt
+ Returns:
+ str: the concatenated prompt
+ """
+ if isinstance(messages, str):
+ return self.get_prompt(messages, sequence_start)
+ system, users, assistants = self._translate_messages(messages)
+ system = self.system if not system else system
+ ret = f'{system}'
+ for user, assistant in zip(users, assistants):
+ if assistant:
+ ret += f'\n{self.user}{user}{self.eos}' \
+ f'\n{self.assistant}{assistant}{self.eos}'
+ else:
+ ret += f'\n{self.user}{user}{self.eos}' \
+ f'\n{self.assistant}'
+ return ret
+
+
def main(model_name: str = 'test'):
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
@@ -637,4 +778,5 @@ def main(model_name: str = 'test'):
if __name__ == '__main__':
import fire
+
fire.Fire(main)
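A sketch of exercising the newly registered SOLAR template. The messages are invented and the exact rendered string depends on `_translate_messages`, so the snippet simply prints whatever the template produces.

```python
# Sketch: render a prompt with the new 'solar' chat template registered above.
from lmdeploy.model import MODELS

solar = MODELS.get('solar')()
messages = [
    dict(role='user', content='Who are you?'),
    dict(role='assistant', content='I am SOLAR, an AI assistant.'),
    dict(role='user', content='Write a haiku about deployment.'),
]
print(solar.messages2prompt(messages))  # turns are framed by '### User:' / '### Assistant:'
```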
diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py
index c30cf6ffe9..2690480a8c 100644
--- a/lmdeploy/pytorch/chat.py
+++ b/lmdeploy/pytorch/chat.py
@@ -51,7 +51,6 @@
import logging
from typing import Optional
-import fire
import torch
from transformers import GenerationConfig, PreTrainedModel
@@ -205,6 +204,8 @@ def main(
def cli():
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/pytorch/modules/linear.py b/lmdeploy/pytorch/modules/linear.py
index bfde0d3d42..218a36407e 100644
--- a/lmdeploy/pytorch/modules/linear.py
+++ b/lmdeploy/pytorch/modules/linear.py
@@ -4,6 +4,11 @@
import torch
from torch import nn
+try:
+ import awq_inference_engine
+except ModuleNotFoundError:
+ awq_inference_engine = None
+
class WeightOnlyQLinear(nn.Module):
"""This class implements weight only quantization linear.
@@ -18,13 +23,15 @@ class WeightOnlyQLinear(nn.Module):
bias (Tensor, optional): Defaults to None.
"""
- def __init__(self,
- w_bit: int,
- symmetry: bool,
- group_size: int,
- in_features: int,
- out_features: int,
- bias: Optional[torch.Tensor] = None) -> None:
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ bias: Optional[torch.Tensor] = True,
+ w_bit: int = 4,
+ symmetry: bool = False,
+ group_size: int = 128,
+ ) -> None:
super().__init__()
if w_bit not in [2, 4, 8]:
@@ -92,8 +99,8 @@ def from_linear(cls: Type['WeightOnlyQLinear'],
out_features = linear.out_features
bias = False if linear.bias is None else True
- qlinear = cls(w_bit, symmetry, group_size, in_features, out_features,
- bias)
+ qlinear = cls(in_features, out_features, bias, w_bit, symmetry,
+ group_size)
qlinear.bias = linear.bias
qparams = quantizer.calculate_qparams(linear.weight)
@@ -124,3 +131,24 @@ def from_linear(cls: Type['WeightOnlyQLinear'],
qlinear.to('cpu')
return qlinear
+
+ @torch.no_grad()
+ def forward(self, x):
+ if awq_inference_engine is None:
+ raise RuntimeError(
+ 'Run the following command to install '
+ 'the kernel for 4bit inference\n\n'
+ 'git clone https://github.com/mit-han-lab/llm-awq.git\n'
+ 'cd awq/kernels\n'
+ 'python setup.py install\n')
+ out_shape = x.shape[:-1] + (self.out_features, )
+ inputs = x.reshape(-1, x.shape[-1])
+
+ out = awq_inference_engine.gemm_forward_cuda(inputs.half(),
+ self.qweight,
+ self.scales.half(),
+ self.qzeros,
+ self.group_size)
+ out = out + self.bias if self.bias is not None else out
+
+ return out.reshape(out_shape)
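The constructor reorder above makes `WeightOnlyQLinear` look like `nn.Linear` with trailing quantization settings. A small sketch of direct construction; the sizes are arbitrary, and running the forward pass still requires the optional `awq_inference_engine` kernels, as the new guard above explains.

```python
# Sketch: build a 4-bit weight-only linear layer with the reordered signature.
from lmdeploy.pytorch.modules.linear import WeightOnlyQLinear

qlinear = WeightOnlyQLinear(in_features=4096,   # arbitrary sizes for illustration
                            out_features=4096,
                            bias=True,
                            w_bit=4,
                            symmetry=False,
                            group_size=128)
# qlinear(x) raises a RuntimeError with install instructions if awq_inference_engine is absent.
```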
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
index 9588b00da1..5abae0d97a 100644
--- a/lmdeploy/serve/async_engine.py
+++ b/lmdeploy/serve/async_engine.py
@@ -4,9 +4,7 @@
import os.path as osp
import random
from contextlib import contextmanager
-from typing import Literal, Optional
-
-from lmdeploy.model import MODELS, BaseModel
+from typing import List, Literal, Optional
@dataclasses.dataclass
@@ -28,7 +26,7 @@ class AsyncEngine:
tp (int): tensor parallel
"""
- def __init__(self, model_path, instance_num=32, tp=1) -> None:
+ def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None:
from lmdeploy import turbomind as tm
from lmdeploy.tokenizer import Tokenizer
tokenizer_model_path = osp.join(model_path, 'triton_models',
@@ -36,18 +34,21 @@ def __init__(self, model_path, instance_num=32, tp=1) -> None:
tokenizer = Tokenizer(tokenizer_model_path)
self.tm_model = tm.TurboMind(model_path,
eos_id=tokenizer.eos_token_id,
- tp=tp)
+ tp=tp,
+ **kwargs)
self.tokenizer = tokenizer
self.generators = [
self.tm_model.create_instance() for i in range(instance_num)
]
self.instance_num = instance_num
- self.model: BaseModel = MODELS.get(self.tm_model.model_name)()
+ self.model = self.tm_model.model
self.available = [True] * instance_num
self.starts = [None] * instance_num
self.steps = {}
+ self.loop = asyncio.get_event_loop()
def stop_session(self, session_id: int):
+ """Stop a session by a session_id."""
instance_id = session_id % self.instance_num
input_ids = self.tokenizer.encode('')
for outputs in self.generators[instance_id].stream_infer(
@@ -60,8 +61,24 @@ def stop_session(self, session_id: int):
pass
self.available[instance_id] = True
+ def end_session(self, session_id: int):
+ """Clear a session by a session_id."""
+ instance_id = session_id % self.instance_num
+ input_ids = self.tokenizer.encode('')
+ for outputs in self.generators[instance_id].stream_infer(
+ session_id,
+ input_ids,
+ request_output_len=0,
+ sequence_start=False,
+ sequence_end=True,
+ stop=True):
+ pass
+ self.steps[str(session_id)] = 0
+ self.available[instance_id] = True
+
@contextmanager
def safe_run(self, instance_id: int, session_id: Optional[int] = None):
+ """A context manager to make sure server's safe running."""
self.available[instance_id] = False
try:
yield
@@ -82,22 +99,80 @@ async def get_generator(self, instance_id: int, stop: bool = False):
await asyncio.sleep(0.1)
return self.generators[instance_id]
+ def batch_infer(self,
+ prompts: List[str],
+ request_output_len=512,
+ top_k=40,
+ top_p=0.8,
+ temperature=0.8,
+ repetition_penalty=1.0,
+ ignore_eos=False,
+ do_preprocess=True,
+ **kwargs):
+ """Inference a batch of prompts.
+
+ Args:
+ prompts (List[str]): a batch of prompts
+ request_output_len (int): output token nums
+ top_k (int): The number of the highest probability vocabulary
+ tokens to keep for top-k-filtering
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or higher
+ are kept for generation.
+ temperature (float): to modulate the next token probability
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+            do_preprocess (bool): whether to pre-process the messages.
+ """
+ assert isinstance(prompts, List), 'prompts should be a list'
+ batch_size = len(prompts)
+ outputs = [''] * batch_size
+ generators = []
+ for i, prompt in enumerate(prompts):
+ generators.append(
+ self.generate(prompt,
+ i,
+ stream_response=True,
+ sequence_start=True,
+ sequence_end=True,
+ request_output_len=request_output_len,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ ignore_eos=ignore_eos,
+ repetition_penalty=repetition_penalty,
+ do_preprocess=do_preprocess,
+ **kwargs))
+
+ async def _inner_call(i, generator):
+ async for out in generator:
+ outputs[i] += out.response
+
+ async def gather():
+ await asyncio.gather(
+ *[_inner_call(i, generators[i]) for i in range(batch_size)])
+
+ self.loop.run_until_complete(gather())
+ return outputs
+
async def generate(
- self,
- messages,
- session_id,
- stream_response=True,
- sequence_start=True,
- sequence_end=False,
- step=0,
- request_output_len=512,
- stop=False,
- top_k=40,
- top_p=0.8,
- temperature=0.8,
- repetition_penalty=1.0,
- ignore_eos=False,
- ):
+ self,
+ messages,
+ session_id,
+ stream_response=True,
+ sequence_start=True,
+ sequence_end=True, # no interactive mode by default
+ step=0,
+ request_output_len=512,
+ stop=False,
+ top_k=40,
+ top_p=0.8,
+ temperature=0.8,
+ repetition_penalty=1.0,
+ ignore_eos=False,
+ do_preprocess=True,
+ **kwargs):
"""Generate responses.
Args:
@@ -109,15 +184,16 @@ async def generate(
sequence_end (bool): indicator for ending a sequence
step (int): the offset of the k/v cache
stop (bool): whether stop inference
- top_p (float): If set to float < 1, only the smallest set of most
- probable tokens with probabilities that add up to top_p or higher
- are kept for generation.
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or higher
+ are kept for generation.
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
+            do_preprocess (bool): whether to pre-process the messages.
"""
instance_id = session_id % self.instance_num
if str(session_id) not in self.steps:
@@ -125,14 +201,18 @@ async def generate(
if step != 0:
self.steps[str(session_id)] = step
seed = random.getrandbits(64)
- prompt = self.model.messages2prompt(messages, sequence_start)
+ prompt = messages
+ if do_preprocess:
+ prompt = self.model.messages2prompt(prompt, sequence_start)
input_ids = self.tokenizer.encode(prompt)
finish_reason = 'stop' if stop else None
if self.steps[str(session_id)] + len(
- input_ids) >= self.tm_model.session_len:
+ input_ids) + request_output_len >= self.tm_model.session_len:
finish_reason = 'length'
yield GenOut('', self.steps[str(session_id)], len(input_ids), 0,
finish_reason)
+ if sequence_end is True and sequence_start is False:
+ self.end_session(session_id)
else:
generator = await self.get_generator(instance_id, stop)
with self.safe_run(instance_id, session_id):
@@ -156,6 +236,11 @@ async def generate(
# decode res
response = self.tokenizer.decode(res.tolist(),
offset=response_size)
+                    # a trailing '�' means the response ends with an
+                    # incomplete utf-8 byte sequence; keep accumulating and
+                    # decode it together with the next chunk
+ if response.endswith('�'):
+ continue
# response, history token len,
# input token len, gen token len
yield GenOut(response, self.steps[str(session_id)],
@@ -166,93 +251,3 @@ async def generate(
self.steps[str(session_id)] += len(input_ids) + tokens
if sequence_end or stop:
self.steps[str(session_id)] = 0
-
- async def generate_openai(
- self,
- messages,
- instance_id,
- stream_response=True,
- renew_session=False,
- request_output_len=512,
- stop=False,
- top_k=40,
- top_p=0.8,
- temperature=0.8,
- repetition_penalty=1.0,
- ignore_eos=False,
- ):
- """Generate responses.
-
- Args:
- messages (str | List): chat history or prompt
- instance_id (int): actually request host ip
- stream_response (bool): whether return responses streamingly
- renew_session (bool): renew the session
- request_output_len (int): output token nums
- stop (bool): whether stop inference
- top_p (float): If set to float < 1, only the smallest set of most
- probable tokens with probabilities that add up to top_p or higher
- are kept for generation.
- top_k (int): The number of the highest probability vocabulary
- tokens to keep for top-k-filtering
- temperature (float): to modulate the next token probability
- repetition_penalty (float): The parameter for repetition penalty.
- 1.0 means no penalty
- ignore_eos (bool): indicator for ignoring eos
- """
- session_id = instance_id
- instance_id %= self.instance_num
- sequence_start = False
- generator = await self.get_generator(instance_id)
- if renew_session: # renew a session
- empty_input_ids = self.tokenizer.encode('')
- for outputs in generator.stream_infer(session_id=session_id,
- input_ids=[empty_input_ids],
- request_output_len=0,
- sequence_start=False,
- sequence_end=True,
- stop=True):
- pass
- self.steps[str(session_id)] = 0
- if str(session_id) not in self.steps:
- self.steps[str(session_id)] = 0
- if self.steps[str(session_id)] == 0:
- sequence_start = True
- seed = random.getrandbits(64)
- prompt = self.model.messages2prompt(messages, sequence_start)
- input_ids = self.tokenizer.encode(prompt)
- finish_reason = 'stop' if stop else None
- if self.steps[str(session_id)] + len(
- input_ids) >= self.tm_model.session_len:
- finish_reason = 'length'
- yield GenOut('', self.steps[str(session_id)], len(input_ids), 0,
- finish_reason)
- else:
- with self.safe_run(instance_id, session_id):
- response_size = 0
- async for outputs in generator.async_stream_infer(
- session_id=session_id,
- input_ids=[input_ids],
- stream_output=stream_response,
- request_output_len=request_output_len,
- sequence_start=(sequence_start),
- sequence_end=False,
- step=self.steps[str(session_id)],
- stop=stop,
- top_k=top_k,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- ignore_eos=ignore_eos,
- random_seed=seed if sequence_start else None):
- res, tokens = outputs[0]
- # decode res
- response = self.tokenizer.decode(res.tolist(),
- offset=response_size)
- # response, history len, input len, generation len
- yield GenOut(response, self.steps[str(session_id)],
- len(input_ids), tokens, finish_reason)
- response_size = tokens
-
- # update step
- self.steps[str(session_id)] += len(input_ids) + tokens
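A sketch of the new `batch_infer` entry point on `AsyncEngine`; the workspace path is a placeholder for a model converted with `lmdeploy convert`.

```python
# Sketch: batched offline inference with the reworked AsyncEngine.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('./workspace', instance_num=4, tp=1)
prompts = ['Hi, please introduce yourself', 'Shanghai is']
outputs = engine.batch_infer(prompts, request_output_len=128, temperature=0.8)
for prompt, output in zip(prompts, outputs):
    print(f'>>> {prompt}\n{output}\n')
```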
diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py
index 283e96e299..424e83143f 100644
--- a/lmdeploy/serve/client.py
+++ b/lmdeploy/serve/client.py
@@ -1,8 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
-import fire
-
from lmdeploy.serve.turbomind.chatbot import Chatbot
@@ -20,7 +18,6 @@ def input_prompt(model_name):
def main(tritonserver_addr: str,
session_id: int = 1,
cap: str = 'chat',
- sys_instruct: str = None,
stream_output: bool = True,
**kwargs):
"""An example to communicate with inference server through the command line
@@ -32,13 +29,11 @@ def main(tritonserver_addr: str,
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infill', 'instruct', 'python']
- sys_instruct (str): the content of 'system' role, which is used by
- conversational model
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's chat template
"""
log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
- kwargs.update(capability=cap, system=sys_instruct)
+ kwargs.update(capability=cap)
chatbot = Chatbot(tritonserver_addr,
log_level=log_level,
display=stream_output,
@@ -69,4 +64,6 @@ def main(tritonserver_addr: str,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/serve/gradio/__init__.py b/lmdeploy/serve/gradio/__init__.py
index ef101fec61..770138a44d 100644
--- a/lmdeploy/serve/gradio/__init__.py
+++ b/lmdeploy/serve/gradio/__init__.py
@@ -1 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .api_server_backend import run_api_server
+from .triton_server_backend import run_triton_server
+from .turbomind_coupled import run_local
+
+__all__ = ['run_api_server', 'run_triton_server', 'run_local']
diff --git a/lmdeploy/serve/gradio/api_server_backend.py b/lmdeploy/serve/gradio/api_server_backend.py
new file mode 100644
index 0000000000..8dd92fa0fd
--- /dev/null
+++ b/lmdeploy/serve/gradio/api_server_backend.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import time
+from threading import Lock
+from typing import Sequence
+
+import gradio as gr
+
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+from lmdeploy.serve.openai.api_client import (get_model_list,
+ get_streaming_response)
+
+
+class InterFace:
+ api_server_url: str = None
+ global_session_id: int = 0
+ lock = Lock()
+
+
+def chat_stream_restful(instruction: str, state_chatbot: Sequence,
+ cancel_btn: gr.Button, reset_btn: gr.Button,
+ session_id: int):
+ """Chat with AI assistant.
+
+ Args:
+ instruction (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ state_chatbot = state_chatbot + [(instruction, None)]
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+ for response, tokens, finish_reason in get_streaming_response(
+ instruction,
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=512,
+ interactive_mode=True):
+ if finish_reason == 'length':
+ gr.Warning('WARNING: exceed session max length.'
+ ' Please restart the session by reset button.')
+ if tokens < 0:
+ gr.Warning('WARNING: running on the old session.'
+ ' Please restart the session by reset button.')
+ if state_chatbot[-1][-1] is None:
+ state_chatbot[-1] = (state_chatbot[-1][0], response)
+ else:
+ state_chatbot[-1] = (state_chatbot[-1][0],
+ state_chatbot[-1][1] + response
+ ) # piece by piece
+ yield (state_chatbot, state_chatbot, enable_btn, disable_btn)
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
+ session_id: int):
+ """reset the session.
+
+ Args:
+ instruction_txtbox (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ state_chatbot = []
+ # end the session
+ for response, tokens, finish_reason in get_streaming_response(
+ '',
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=0,
+ interactive_mode=False):
+ pass
+
+ return (
+ state_chatbot,
+ state_chatbot,
+ gr.Textbox.update(value=''),
+ )
+
+
+def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button,
+ reset_btn: gr.Button, session_id: int):
+ """stop the session.
+
+ Args:
+ instruction_txtbox (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ yield (state_chatbot, disable_btn, disable_btn)
+ # end the session
+ for out in get_streaming_response(
+ '',
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=0,
+ stop=True):
+ pass
+ time.sleep(0.5)
+ messages = []
+ for qa in state_chatbot:
+ messages.append(dict(role='user', content=qa[0]))
+ if qa[1] is not None:
+ messages.append(dict(role='assistant', content=qa[1]))
+ for out in get_streaming_response(
+ messages,
+ f'{InterFace.api_server_url}/v1/chat/interactive',
+ session_id=session_id,
+ request_output_len=0,
+ interactive_mode=True):
+ pass
+ yield (state_chatbot, disable_btn, enable_btn)
+
+
+def run_api_server(api_server_url: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006,
+ batch_size: int = 32):
+ """chat with AI assistant through web ui.
+
+ Args:
+        api_server_url (str): restful api url
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+        batch_size (int): concurrency of the gradio request queue
+ """
+ InterFace.api_server_url = api_server_url
+ model_names = get_model_list(f'{api_server_url}/v1/models')
+ model_name = ''
+ if isinstance(model_names, list) and len(model_names) > 0:
+ model_name = model_names[0]
+ else:
+        raise ValueError('gradio can not find a suitable model from restful-api')
+
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ state_chatbot = gr.State([])
+ state_session_id = gr.State(0)
+
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy Playground')
+
+ chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
+ instruction_txtbox = gr.Textbox(
+ placeholder='Please input the instruction',
+ label='Instruction')
+ with gr.Row():
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ send_event = instruction_txtbox.submit(chat_stream_restful, [
+ instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
+ state_session_id
+ ], [state_chatbot, chatbot, cancel_btn, reset_btn])
+ instruction_txtbox.submit(
+ lambda: gr.Textbox.update(value=''),
+ [],
+ [instruction_txtbox],
+ )
+ cancel_btn.click(
+ cancel_restful_func,
+ [state_chatbot, cancel_btn, reset_btn, state_session_id],
+ [state_chatbot, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(reset_restful_func,
+ [instruction_txtbox, state_chatbot, state_session_id],
+ [state_chatbot, chatbot, instruction_txtbox],
+ cancels=[send_event])
+
+ def init():
+ with InterFace.lock:
+ InterFace.global_session_id += 1
+ new_session_id = InterFace.global_session_id
+ return new_session_id
+
+ demo.load(init, inputs=None, outputs=[state_session_id])
+
+    print(f'server will be mounted at: http://{server_name}:{server_port}')
+ demo.queue(concurrency_count=batch_size, max_size=100,
+ api_open=True).launch(
+ max_threads=10,
+ share=True,
+ server_port=server_port,
+ server_name=server_name,
+ )
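
For reference, a minimal sketch of launching the new api_server-backed gradio UI programmatically, assuming an api_server is already listening at an address such as http://0.0.0.0:23333 (the URL and ports below are illustrative):

```python
# Sketch: mount a gradio playground on top of a running api_server.
from lmdeploy.serve.gradio.api_server_backend import run_api_server

if __name__ == '__main__':
    run_api_server('http://0.0.0.0:23333',  # assumed api_server address
                   server_name='0.0.0.0',
                   server_port=6006,
                   batch_size=32)
```
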
diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py
index 71db7a2749..5b1668224d 100644
--- a/lmdeploy/serve/gradio/app.py
+++ b/lmdeploy/serve/gradio/app.py
@@ -1,542 +1,41 @@
# Copyright (c) OpenMMLab. All rights reserved.
-import os
-import threading
-import time
-from functools import partial
-from typing import Sequence
-
-import fire
-import gradio as gr
-
-from lmdeploy.serve.async_engine import AsyncEngine
-from lmdeploy.serve.gradio.css import CSS
-from lmdeploy.serve.openai.api_client import (get_model_list,
- get_streaming_response)
-from lmdeploy.serve.openai.api_server import ip2id
-from lmdeploy.serve.turbomind.chatbot import Chatbot
-
-THEME = gr.themes.Soft(
- primary_hue=gr.themes.colors.blue,
- secondary_hue=gr.themes.colors.sky,
- font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
-
-enable_btn = gr.Button.update(interactive=True)
-disable_btn = gr.Button.update(interactive=False)
-
-
-def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot,
- request: gr.Request):
- """Chat with AI assistant.
-
- Args:
- instruction (str): user's prompt
- state_chatbot (Sequence): the chatting history
- llama_chatbot (Chatbot): the instance of a chatbot
- request (gr.Request): the request from a user
- model_name (str): the name of deployed model
- """
- instruction = state_chatbot[-1][0]
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
-
- bot_response = llama_chatbot.stream_infer(
- session_id, instruction, f'{session_id}-{len(state_chatbot)}')
-
- for status, tokens, _ in bot_response:
- state_chatbot[-1] = (state_chatbot[-1][0], tokens)
- yield (state_chatbot, state_chatbot, '')
-
- return (state_chatbot, state_chatbot, '')
-
-
-def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
- llama_chatbot: gr.State, triton_server_addr: str,
- model_name: str):
- """reset the session."""
- state_chatbot = []
- log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
- llama_chatbot = Chatbot(triton_server_addr,
- model_name,
- log_level=log_level,
- display=True)
-
- return (
- llama_chatbot,
- state_chatbot,
- state_chatbot,
- gr.Textbox.update(value=''),
- )
-
-
-def cancel_func(
- instruction_txtbox: gr.Textbox,
- state_chatbot: gr.State,
- llama_chatbot: gr.State,
-):
- """cancel the session."""
- session_id = llama_chatbot._session.session_id
- llama_chatbot.cancel(session_id)
-
- return (
- llama_chatbot,
- state_chatbot,
- )
-
-
-def add_instruction(instruction, state_chatbot):
- state_chatbot = state_chatbot + [(instruction, None)]
- return ('', state_chatbot)
-
-
-def run_server(triton_server_addr: str,
- server_name: str = 'localhost',
- server_port: int = 6006):
- """chat with AI assistant through web ui.
-
- Args:
- triton_server_addr (str): the communication address of inference server
- server_name (str): the ip address of gradio server
- server_port (int): the port of gradio server
- """
- with gr.Blocks(css=CSS, theme=THEME) as demo:
- log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
- llama_chatbot = gr.State(
- Chatbot(triton_server_addr, log_level=log_level, display=True))
- state_chatbot = gr.State([])
- model_name = llama_chatbot.value.model_name
- reset_all = partial(reset_all_func,
- model_name=model_name,
- triton_server_addr=triton_server_addr)
-
- with gr.Column(elem_id='container'):
- gr.Markdown('## LMDeploy Playground')
-
- chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
- instruction_txtbox = gr.Textbox(
- placeholder='Please input the instruction',
- label='Instruction')
- with gr.Row():
- cancel_btn = gr.Button(value='Cancel')
- reset_btn = gr.Button(value='Reset')
-
- send_event = instruction_txtbox.submit(
- add_instruction, [instruction_txtbox, state_chatbot],
- [instruction_txtbox, state_chatbot]).then(
- chat_stream, [state_chatbot, llama_chatbot],
- [state_chatbot, chatbot])
-
- cancel_btn.click(cancel_func,
- [instruction_txtbox, state_chatbot, llama_chatbot],
- [llama_chatbot, chatbot],
- cancels=[send_event])
-
- reset_btn.click(
- reset_all, [instruction_txtbox, state_chatbot, llama_chatbot],
- [llama_chatbot, state_chatbot, chatbot, instruction_txtbox],
- cancels=[send_event])
-
- print(f'server is gonna mount on: http://{server_name}:{server_port}')
- demo.queue(concurrency_count=4, max_size=100, api_open=True).launch(
- max_threads=10,
- share=True,
- server_port=server_port,
- server_name=server_name,
- )
-
-
-# a IO interface mananing variables
-class InterFace:
- async_engine: AsyncEngine = None # for run_local
- restful_api_url: str = None # for run_restful
-
-
-def chat_stream_restful(
- instruction: str,
- state_chatbot: Sequence,
- cancel_btn: gr.Button,
- reset_btn: gr.Button,
- request: gr.Request,
-):
- """Chat with AI assistant.
-
- Args:
- instruction (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- bot_summarized_response = ''
- state_chatbot = state_chatbot + [(instruction, None)]
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
- for response, tokens, finish_reason in get_streaming_response(
- instruction,
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=512,
- sequence_start=(len(state_chatbot) == 1),
- sequence_end=False):
- if finish_reason == 'length':
- gr.Warning('WARNING: exceed session max length.'
- ' Please restart the session by reset button.')
- if tokens < 0:
- gr.Warning('WARNING: running on the old session.'
- ' Please restart the session by reset button.')
- if state_chatbot[-1][-1] is None:
- state_chatbot[-1] = (state_chatbot[-1][0], response)
- else:
- state_chatbot[-1] = (state_chatbot[-1][0],
- state_chatbot[-1][1] + response
- ) # piece by piece
- yield (state_chatbot, state_chatbot, enable_btn, disable_btn,
- f'{bot_summarized_response}'.strip())
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
-
-def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
- request: gr.Request):
- """reset the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- state_chatbot = []
-
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- for response, tokens, finish_reason in get_streaming_response(
- '',
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=False,
- sequence_end=True):
- pass
-
- return (
- state_chatbot,
- state_chatbot,
- gr.Textbox.update(value=''),
- )
-
-
-def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button,
- reset_btn: gr.Button, request: gr.Request):
- """stop the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- for out in get_streaming_response('',
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=False,
- sequence_end=False,
- stop=True):
- pass
- time.sleep(0.5)
- messages = []
- for qa in state_chatbot:
- messages.append(dict(role='user', content=qa[0]))
- if qa[1] is not None:
- messages.append(dict(role='assistant', content=qa[1]))
- for out in get_streaming_response(messages,
- f'{InterFace.restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=True,
- sequence_end=False):
- pass
- return (state_chatbot, disable_btn, enable_btn)
-
-
-def run_restful(restful_api_url: str,
- server_name: str = 'localhost',
- server_port: int = 6006,
- batch_size: int = 32):
- """chat with AI assistant through web ui.
-
- Args:
- restful_api_url (str): restufl api url
- server_name (str): the ip address of gradio server
- server_port (int): the port of gradio server
- batch_size (int): batch size for running Turbomind directly
- """
- InterFace.restful_api_url = restful_api_url
- model_names = get_model_list(f'{restful_api_url}/v1/models')
- model_name = ''
- if isinstance(model_names, list) and len(model_names) > 0:
- model_name = model_names[0]
- else:
- raise ValueError('gradio can find a suitable model from restful-api')
-
- with gr.Blocks(css=CSS, theme=THEME) as demo:
- state_chatbot = gr.State([])
-
- with gr.Column(elem_id='container'):
- gr.Markdown('## LMDeploy Playground')
-
- chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
- instruction_txtbox = gr.Textbox(
- placeholder='Please input the instruction',
- label='Instruction')
- with gr.Row():
- cancel_btn = gr.Button(value='Cancel', interactive=False)
- reset_btn = gr.Button(value='Reset')
-
- send_event = instruction_txtbox.submit(
- chat_stream_restful,
- [instruction_txtbox, state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, chatbot, cancel_btn, reset_btn])
- instruction_txtbox.submit(
- lambda: gr.Textbox.update(value=''),
- [],
- [instruction_txtbox],
- )
- cancel_btn.click(cancel_restful_func,
- [state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, cancel_btn, reset_btn],
- cancels=[send_event])
-
- reset_btn.click(reset_restful_func,
- [instruction_txtbox, state_chatbot],
- [state_chatbot, chatbot, instruction_txtbox],
- cancels=[send_event])
-
- print(f'server is gonna mount on: http://{server_name}:{server_port}')
- demo.queue(concurrency_count=batch_size, max_size=100,
- api_open=True).launch(
- max_threads=10,
- share=True,
- server_port=server_port,
- server_name=server_name,
- )
-
-
-async def chat_stream_local(
- instruction: str,
- state_chatbot: Sequence,
- cancel_btn: gr.Button,
- reset_btn: gr.Button,
- request: gr.Request,
-):
- """Chat with AI assistant.
-
- Args:
- instruction (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- bot_summarized_response = ''
- state_chatbot = state_chatbot + [(instruction, None)]
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
- async for outputs in InterFace.async_engine.generate(
- instruction,
- session_id,
- stream_response=True,
- sequence_start=(len(state_chatbot) == 1)):
- response = outputs.response
- if outputs.finish_reason == 'length':
- gr.Warning('WARNING: exceed session max length.'
- ' Please restart the session by reset button.')
- if outputs.generate_token_len < 0:
- gr.Warning('WARNING: running on the old session.'
- ' Please restart the session by reset button.')
- if state_chatbot[-1][-1] is None:
- state_chatbot[-1] = (state_chatbot[-1][0], response)
- else:
- state_chatbot[-1] = (state_chatbot[-1][0],
- state_chatbot[-1][1] + response
- ) # piece by piece
- yield (state_chatbot, state_chatbot, enable_btn, disable_btn,
- f'{bot_summarized_response}'.strip())
-
- yield (state_chatbot, state_chatbot, disable_btn, enable_btn,
- f'{bot_summarized_response}'.strip())
-
-
-async def reset_local_func(instruction_txtbox: gr.Textbox,
- state_chatbot: gr.State, request: gr.Request):
- """reset the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- state_chatbot = []
-
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- async for out in InterFace.async_engine.generate('',
- session_id,
- request_output_len=1,
- stream_response=True,
- sequence_start=False,
- sequence_end=True):
- pass
-
- return (
- state_chatbot,
- state_chatbot,
- gr.Textbox.update(value=''),
- )
-
-
-async def cancel_local_func(state_chatbot: gr.State, cancel_btn: gr.Button,
- reset_btn: gr.Button, request: gr.Request):
- """stop the session.
-
- Args:
- instruction_txtbox (str): user's prompt
- state_chatbot (Sequence): the chatting history
- request (gr.Request): the request from a user
- """
- session_id = threading.current_thread().ident
- if request is not None:
- session_id = ip2id(request.kwargs['client']['host'])
- # end the session
- async for out in InterFace.async_engine.generate('',
- session_id,
- request_output_len=0,
- stream_response=True,
- sequence_start=False,
- sequence_end=False,
- stop=True):
- pass
- messages = []
- for qa in state_chatbot:
- messages.append(dict(role='user', content=qa[0]))
- if qa[1] is not None:
- messages.append(dict(role='assistant', content=qa[1]))
- async for out in InterFace.async_engine.generate(messages,
- session_id,
- request_output_len=0,
- stream_response=True,
- sequence_start=True,
- sequence_end=False):
- pass
- return (state_chatbot, disable_btn, enable_btn)
-
-
-def run_local(model_path: str,
- server_name: str = 'localhost',
- server_port: int = 6006,
- batch_size: int = 4,
- tp: int = 1):
- """chat with AI assistant through web ui.
-
- Args:
- model_path (str): the path of the deployed model
- server_name (str): the ip address of gradio server
- server_port (int): the port of gradio server
- batch_size (int): batch size for running Turbomind directly
- tp (int): tensor parallel for Turbomind
- """
- InterFace.async_engine = AsyncEngine(model_path=model_path,
- instance_num=batch_size,
- tp=tp)
-
- with gr.Blocks(css=CSS, theme=THEME) as demo:
- state_chatbot = gr.State([])
-
- with gr.Column(elem_id='container'):
- gr.Markdown('## LMDeploy Playground')
-
- chatbot = gr.Chatbot(
- elem_id='chatbot',
- label=InterFace.async_engine.tm_model.model_name)
- instruction_txtbox = gr.Textbox(
- placeholder='Please input the instruction',
- label='Instruction')
- with gr.Row():
- cancel_btn = gr.Button(value='Cancel', interactive=False)
- reset_btn = gr.Button(value='Reset')
-
- send_event = instruction_txtbox.submit(
- chat_stream_local,
- [instruction_txtbox, state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, chatbot, cancel_btn, reset_btn])
- instruction_txtbox.submit(
- lambda: gr.Textbox.update(value=''),
- [],
- [instruction_txtbox],
- )
- cancel_btn.click(cancel_local_func,
- [state_chatbot, cancel_btn, reset_btn],
- [state_chatbot, cancel_btn, reset_btn],
- cancels=[send_event])
-
- reset_btn.click(reset_local_func, [instruction_txtbox, state_chatbot],
- [state_chatbot, chatbot, instruction_txtbox],
- cancels=[send_event])
-
- print(f'server is gonna mount on: http://{server_name}:{server_port}')
- demo.queue(concurrency_count=batch_size, max_size=100,
- api_open=True).launch(
- max_threads=10,
- share=True,
- server_port=server_port,
- server_name=server_name,
- )
def run(model_path_or_server: str,
- server_name: str = 'localhost',
+ server_name: str = '0.0.0.0',
server_port: int = 6006,
batch_size: int = 32,
tp: int = 1,
- restful_api: bool = False):
+ **kwargs):
"""chat with AI assistant through web ui.
Args:
model_path_or_server (str): the path of the deployed model or the
- tritonserver URL or restful api URL. The former is for directly
- running service with gradio. The latter is for running with
- tritonserver by default. If the input URL is restful api. Please
- enable another flag `restful_api`.
+ tritonserver URL or restful api URL. For example:
+ - ./workspace
+ - 0.0.0.0:23333
+ - http://0.0.0.0:23333
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
- restufl_api (bool): a flag for model_path_or_server
"""
if ':' in model_path_or_server:
- if restful_api:
- run_restful(model_path_or_server, server_name, server_port,
- batch_size)
+ if 'http:' in model_path_or_server:
+ from lmdeploy.serve.gradio.api_server_backend import run_api_server
+ run_api_server(model_path_or_server, server_name, server_port,
+ batch_size)
else:
- run_server(model_path_or_server, server_name, server_port)
+ from lmdeploy.serve.gradio.triton_server_backend import \
+ run_triton_server
+ run_triton_server(model_path_or_server, server_name, server_port)
else:
+ from lmdeploy.serve.gradio.turbomind_coupled import run_local
run_local(model_path_or_server, server_name, server_port, batch_size,
tp)
if __name__ == '__main__':
+ import fire
+
fire.Fire(run)
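
The slimmed-down `run` now only decides which backend to import based on the form of `model_path_or_server`; a sketch of the three resulting paths (addresses are illustrative, and each call blocks while the gradio server runs):

```python
# Sketch of the dispatch done by lmdeploy.serve.gradio.app.run():
#   './workspace'          -> turbomind_coupled.run_local   (in-process engine)
#   '0.0.0.0:33337'        -> triton_server_backend.run_triton_server
#   'http://0.0.0.0:23333' -> api_server_backend.run_api_server
from lmdeploy.serve.gradio.app import run

run('http://0.0.0.0:23333', server_name='0.0.0.0', server_port=6006)
```
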
diff --git a/lmdeploy/serve/gradio/constants.py b/lmdeploy/serve/gradio/constants.py
new file mode 100644
index 0000000000..891c572e5a
--- /dev/null
+++ b/lmdeploy/serve/gradio/constants.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import gradio as gr
+
+CSS = """
+#container {
+ width: 95%;
+ margin-left: auto;
+ margin-right: auto;
+}
+
+#chatbot {
+ height: 500px;
+ overflow: auto;
+}
+
+.chat_wrap_space {
+ margin-left: 0.5em
+}
+"""
+
+THEME = gr.themes.Soft(
+ primary_hue=gr.themes.colors.blue,
+ secondary_hue=gr.themes.colors.sky,
+ font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
+
+enable_btn = gr.Button.update(interactive=True)
+disable_btn = gr.Button.update(interactive=False)
diff --git a/lmdeploy/serve/gradio/css.py b/lmdeploy/serve/gradio/css.py
deleted file mode 100644
index b3bd233222..0000000000
--- a/lmdeploy/serve/gradio/css.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-CSS = """
-#container {
- width: 95%;
- margin-left: auto;
- margin-right: auto;
-}
-
-#chatbot {
- height: 500px;
- overflow: auto;
-}
-
-.chat_wrap_space {
- margin-left: 0.5em
-}
-"""
diff --git a/lmdeploy/serve/gradio/triton_server_backend.py b/lmdeploy/serve/gradio/triton_server_backend.py
new file mode 100644
index 0000000000..9148903cc5
--- /dev/null
+++ b/lmdeploy/serve/gradio/triton_server_backend.py
@@ -0,0 +1,143 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from functools import partial
+from threading import Lock
+from typing import Sequence
+
+import gradio as gr
+
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+from lmdeploy.serve.turbomind.chatbot import Chatbot
+
+
+class InterFace:
+ global_session_id: int = 0
+ lock = Lock()
+
+
+def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot,
+ cancel_btn: gr.Button, reset_btn: gr.Button, session_id: int):
+ """Chat with AI assistant.
+
+ Args:
+        state_chatbot (Sequence): the chatting history
+        llama_chatbot (Chatbot): the instance of a chatbot
+        cancel_btn (gr.Button): the cancel button
+        reset_btn (gr.Button): the reset button
+ session_id (int): the session id
+ """
+ instruction = state_chatbot[-1][0]
+
+ bot_response = llama_chatbot.stream_infer(
+ session_id, instruction, f'{session_id}-{len(state_chatbot)}')
+
+ for status, tokens, _ in bot_response:
+ state_chatbot[-1] = (state_chatbot[-1][0], tokens)
+ yield (state_chatbot, state_chatbot, enable_btn, disable_btn)
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State,
+ llama_chatbot: gr.State, triton_server_addr: str,
+ model_name: str):
+ """reset the session."""
+ state_chatbot = []
+ log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
+ llama_chatbot = Chatbot(triton_server_addr,
+ model_name,
+ log_level=log_level,
+ display=True)
+
+ return (
+ llama_chatbot,
+ state_chatbot,
+ state_chatbot,
+ gr.Textbox.update(value=''),
+ )
+
+
+def cancel_func(
+ state_chatbot: gr.State,
+ llama_chatbot: gr.State,
+ cancel_btn: gr.Button,
+ reset_btn: gr.Button,
+):
+ """cancel the session."""
+ yield (llama_chatbot, state_chatbot, disable_btn, disable_btn)
+ session_id = llama_chatbot._session.session_id
+ llama_chatbot.cancel(session_id)
+
+ yield (llama_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+def add_instruction(instruction, state_chatbot):
+ state_chatbot = state_chatbot + [(instruction, None)]
+ return ('', state_chatbot)
+
+
+def run_triton_server(triton_server_addr: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006):
+ """chat with AI assistant through web ui.
+
+ Args:
+ triton_server_addr (str): the communication address of inference server
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ """
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
+ llama_chatbot = gr.State(
+ Chatbot(triton_server_addr, log_level=log_level, display=True))
+ state_chatbot = gr.State([])
+ state_session_id = gr.State(0)
+ model_name = llama_chatbot.value.model_name
+ reset_all = partial(reset_all_func,
+ model_name=model_name,
+ triton_server_addr=triton_server_addr)
+
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy Playground')
+
+ chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
+ instruction_txtbox = gr.Textbox(
+ placeholder='Please input the instruction',
+ label='Instruction')
+ with gr.Row():
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ send_event = instruction_txtbox.submit(
+ add_instruction, [instruction_txtbox, state_chatbot],
+ [instruction_txtbox, state_chatbot]).then(chat_stream, [
+ state_chatbot, llama_chatbot, cancel_btn, reset_btn,
+ state_session_id
+ ], [state_chatbot, chatbot, cancel_btn, reset_btn])
+
+ cancel_btn.click(cancel_func,
+ [state_chatbot, llama_chatbot, cancel_btn, reset_btn],
+ [llama_chatbot, chatbot, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(
+ reset_all, [instruction_txtbox, state_chatbot, llama_chatbot],
+ [llama_chatbot, state_chatbot, chatbot, instruction_txtbox],
+ cancels=[send_event])
+
+ def init():
+ with InterFace.lock:
+ InterFace.global_session_id += 1
+ new_session_id = InterFace.global_session_id
+ return new_session_id
+
+ demo.load(init, inputs=None, outputs=[state_session_id])
+
+    print(f'server will be mounted at: http://{server_name}:{server_port}')
+ demo.queue(concurrency_count=4, max_size=100, api_open=True).launch(
+ max_threads=10,
+ share=True,
+ server_port=server_port,
+ server_name=server_name,
+ )
diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py
new file mode 100644
index 0000000000..e344abcbda
--- /dev/null
+++ b/lmdeploy/serve/gradio/turbomind_coupled.py
@@ -0,0 +1,187 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from threading import Lock
+from typing import Sequence
+
+import gradio as gr
+
+from lmdeploy.serve.async_engine import AsyncEngine
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+
+
+class InterFace:
+ async_engine: AsyncEngine = None
+ global_session_id: int = 0
+ lock = Lock()
+
+
+async def chat_stream_local(
+ instruction: str,
+ state_chatbot: Sequence,
+ cancel_btn: gr.Button,
+ reset_btn: gr.Button,
+ session_id: int,
+):
+ """Chat with AI assistant.
+
+ Args:
+ instruction (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ cancel_btn (gr.Button): the cancel button
+ reset_btn (gr.Button): the reset button
+ session_id (int): the session id
+ """
+ state_chatbot = state_chatbot + [(instruction, None)]
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+ async for outputs in InterFace.async_engine.generate(
+ instruction,
+ session_id,
+ stream_response=True,
+ sequence_start=(len(state_chatbot) == 1),
+ sequence_end=False):
+ response = outputs.response
+ if outputs.finish_reason == 'length':
+            gr.Warning('WARNING: exceeded session max length.'
+                       ' Please restart the session by the reset button.')
+        if outputs.generate_token_len < 0:
+            gr.Warning('WARNING: running on an old session.'
+                       ' Please restart the session by the reset button.')
+ if state_chatbot[-1][-1] is None:
+ state_chatbot[-1] = (state_chatbot[-1][0], response)
+ else:
+ state_chatbot[-1] = (state_chatbot[-1][0],
+ state_chatbot[-1][1] + response
+ ) # piece by piece
+ yield (state_chatbot, state_chatbot, enable_btn, disable_btn)
+
+ yield (state_chatbot, state_chatbot, disable_btn, enable_btn)
+
+
+async def reset_local_func(instruction_txtbox: gr.Textbox,
+ state_chatbot: Sequence, session_id: int):
+ """reset the session.
+
+ Args:
+ instruction_txtbox (str): user's prompt
+ state_chatbot (Sequence): the chatting history
+ session_id (int): the session id
+ """
+ state_chatbot = []
+ # end the session
+ async for out in InterFace.async_engine.generate('',
+ session_id,
+ request_output_len=1,
+ stream_response=True,
+ sequence_start=False,
+ sequence_end=True):
+ pass
+ return (state_chatbot, state_chatbot, gr.Textbox.update(value=''))
+
+
+async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,
+ reset_btn: gr.Button, session_id: int):
+ """stop the session.
+
+ Args:
+ state_chatbot (Sequence): the chatting history
+ cancel_btn (gr.Button): the cancel button
+ reset_btn (gr.Button): the reset button
+ session_id (int): the session id
+ """
+ yield (state_chatbot, disable_btn, enable_btn)
+ async for out in InterFace.async_engine.generate('',
+ session_id,
+ request_output_len=0,
+ stream_response=True,
+ sequence_start=False,
+ sequence_end=False,
+ stop=True):
+ pass
+ messages = []
+ for qa in state_chatbot:
+ messages.append(dict(role='user', content=qa[0]))
+ if qa[1] is not None:
+ messages.append(dict(role='assistant', content=qa[1]))
+ async for out in InterFace.async_engine.generate(messages,
+ session_id,
+ request_output_len=0,
+ stream_response=True,
+ sequence_start=True,
+ sequence_end=False):
+ pass
+ yield (state_chatbot, disable_btn, enable_btn)
+
+
+def run_local(model_path: str,
+ server_name: str = 'localhost',
+ server_port: int = 6006,
+ batch_size: int = 4,
+ tp: int = 1):
+ """chat with AI assistant through web ui.
+
+ Args:
+ model_path (str): the path of the deployed model
+ server_name (str): the ip address of gradio server
+ server_port (int): the port of gradio server
+ batch_size (int): batch size for running Turbomind directly
+ tp (int): tensor parallel for Turbomind
+ """
+ InterFace.async_engine = AsyncEngine(model_path=model_path,
+ instance_num=batch_size,
+ tp=tp)
+
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ state_chatbot = gr.State([])
+ state_session_id = gr.State(0)
+
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy Playground')
+
+ chatbot = gr.Chatbot(
+ elem_id='chatbot',
+ label=InterFace.async_engine.tm_model.model_name)
+ instruction_txtbox = gr.Textbox(
+ placeholder='Please input the instruction',
+ label='Instruction')
+ with gr.Row():
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ send_event = instruction_txtbox.submit(chat_stream_local, [
+ instruction_txtbox, state_chatbot, cancel_btn, reset_btn,
+ state_session_id
+ ], [state_chatbot, chatbot, cancel_btn, reset_btn])
+ instruction_txtbox.submit(
+ lambda: gr.Textbox.update(value=''),
+ [],
+ [instruction_txtbox],
+ )
+ cancel_btn.click(
+ cancel_local_func,
+ [state_chatbot, cancel_btn, reset_btn, state_session_id],
+ [state_chatbot, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(reset_local_func,
+ [instruction_txtbox, state_chatbot, state_session_id],
+ [state_chatbot, chatbot, instruction_txtbox],
+ cancels=[send_event])
+
+ def init():
+ with InterFace.lock:
+ InterFace.global_session_id += 1
+ new_session_id = InterFace.global_session_id
+ return new_session_id
+
+ demo.load(init, inputs=None, outputs=[state_session_id])
+
+    print(f'server will be mounted at: http://{server_name}:{server_port}')
+ demo.queue(concurrency_count=batch_size, max_size=100,
+ api_open=True).launch(
+ max_threads=10,
+ share=True,
+ server_port=server_port,
+ server_name=server_name,
+ )
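
Both new gradio backends hand out session ids with the same pattern: a process-wide counter guarded by a lock, bumped once per page load through `demo.load`. A standalone sketch of that pattern (the widget names and `show` handler are made up for illustration; gradio 3.x API):

```python
from threading import Lock

import gradio as gr


class SessionCounter:
    """Process-wide counter (illustrative stand-in for InterFace)."""
    value: int = 0
    lock = Lock()


def init():
    # one fresh id per page load; every event from that tab reuses it
    with SessionCounter.lock:
        SessionCounter.value += 1
        return SessionCounter.value


with gr.Blocks() as demo:
    session_id = gr.State(0)
    shown = gr.Textbox(label='my session id')
    show = gr.Button('show')
    show.click(lambda sid: str(sid), [session_id], [shown])
    demo.load(init, inputs=None, outputs=[session_id])

if __name__ == '__main__':
    demo.queue().launch()
```
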
diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index a8718331be..a1610e05ea 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -1,8 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
-from typing import Iterable, List
+from typing import Any, Dict, Iterable, List, Optional, Union
-import fire
import requests
@@ -15,13 +14,306 @@ def get_model_list(api_url: str):
return None
+class APIClient:
+    """Client of LMDeploy's RESTful api_server.
+
+    Args:
+        api_server_url (str): the address of the api_server, e.g.
+            'http://{server_ip}:{server_port}'
+ """
+
+ def __init__(self, api_server_url: str, **kwargs):
+ self.api_server_url = api_server_url
+        self.chat_interactive_v1_url = f'{api_server_url}/v1/chat/interactive'
+ self.chat_completions_v1_url = f'{api_server_url}/v1/chat/completions'
+ self.completions_v1_url = f'{api_server_url}/v1/completions'
+ self.models_v1_url = f'{api_server_url}/v1/models'
+ self._available_models = None
+
+ @property
+ def available_models(self):
+ """Show available models."""
+ if self._available_models is not None:
+ return self._available_models
+ response = requests.get(self.models_v1_url)
+ if hasattr(response, 'text'):
+ model_list = json.loads(response.text)
+ model_list = model_list.pop('data', [])
+ self._available_models = [item['id'] for item in model_list]
+ return self._available_models
+ return None
+
+ def chat_completions_v1(self,
+ model: str,
+ messages: Union[str, List[Dict[str, str]]],
+ temperature: Optional[float] = 0.7,
+ top_p: Optional[float] = 1.0,
+ n: Optional[int] = 1,
+ max_tokens: Optional[int] = 512,
+ stop: Optional[bool] = False,
+ stream: Optional[bool] = False,
+ presence_penalty: Optional[float] = 0.0,
+ frequency_penalty: Optional[float] = 0.0,
+ user: Optional[str] = None,
+ repetition_penalty: Optional[float] = 1.0,
+ session_id: Optional[int] = -1,
+ ignore_eos: Optional[bool] = False,
+ **kwargs):
+ """Chat completion v1.
+
+ Args:
+ model: model name. Available from self.available_models.
+ messages: string prompt or chat history in OpenAI format.
+ temperature (float): to modulate the next token probability
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ n (int): How many chat completion choices to generate for each
+ input message. Only support one here.
+ stream: whether to stream the results or not. Default to false.
+ max_tokens (int): output token nums
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+ session_id (int): if not specified, will set random value
+
+ Yields:
+ json objects in openai formats
+ """
+ pload = {
+ k: v
+ for k, v in locals().copy().items()
+ if k[:2] != '__' and k not in ['self']
+ }
+ headers = {'content-type': 'application/json'}
+ response = requests.post(self.chat_completions_v1_url,
+ headers=headers,
+ json=pload,
+ stream=stream)
+ for chunk in response.iter_lines(chunk_size=8192,
+ decode_unicode=False,
+ delimiter=b'\n'):
+ if chunk:
+ if stream:
+ decoded = chunk.decode('utf-8')
+ if decoded == 'data: [DONE]':
+ continue
+ if decoded[:6] == 'data: ':
+ decoded = decoded[6:]
+ output = json.loads(decoded)
+ yield output
+ else:
+ decoded = chunk.decode('utf-8')
+ output = json.loads(decoded)
+ yield output
+
+ def chat_interactive_v1(self,
+ prompt: Union[str, List[Dict[str, str]]],
+ session_id: int = -1,
+ interactive_mode: bool = False,
+ stream: bool = False,
+ stop: bool = False,
+ request_output_len: int = 512,
+ top_p: float = 0.8,
+ top_k: int = 40,
+ temperature: float = 0.8,
+ repetition_penalty: float = 1.0,
+ ignore_eos: bool = False,
+ **kwargs):
+ """Interactive completions.
+
+        - In interactive mode, the chat history is kept on the server. Please
+          set `interactive_mode = True`.
+        - In normal mode, no chat history is kept on the server. Set
+          `interactive_mode = False`.
+
+ Args:
+ prompt: the prompt to use for the generation.
+ session_id: determine which instance will be called.
+                If not specified with a value other than -1, a random value
+                will be used.
+ interactive_mode (bool): turn on interactive mode or not. On
+ interactive mode, session history is kept on the server (and
+ vice versa).
+ stream: whether to stream the results or not.
+ stop: whether to stop the session response or not.
+ request_output_len (int): output token nums
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ top_k (int): The number of the highest probability vocabulary
+ tokens to keep for top-k-filtering
+ temperature (float): to modulate the next token probability
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+
+ Yields:
+            json objects consisting of text, tokens, finish_reason
+ """
+ pload = {
+ k: v
+ for k, v in locals().copy().items()
+ if k[:2] != '__' and k not in ['self']
+ }
+ headers = {'content-type': 'application/json'}
+        response = requests.post(self.chat_interactive_v1_url,
+ headers=headers,
+ json=pload,
+ stream=stream)
+ for chunk in response.iter_lines(chunk_size=8192,
+ decode_unicode=False,
+ delimiter=b'\n'):
+ if chunk:
+ decoded = chunk.decode('utf-8')
+ output = json.loads(decoded)
+ yield output
+
+ def completions_v1(
+ self,
+ model: str,
+ prompt: Union[str, List[Any]],
+ suffix: Optional[str] = None,
+ temperature: Optional[float] = 0.7,
+ n: Optional[int] = 1,
+ max_tokens: Optional[int] = 16,
+ stream: Optional[bool] = False,
+ top_p: Optional[float] = 1.0,
+ user: Optional[str] = None,
+ # additional argument of lmdeploy
+ repetition_penalty: Optional[float] = 1.0,
+ session_id: Optional[int] = -1,
+ ignore_eos: Optional[bool] = False,
+ **kwargs):
+        """Completion v1.
+
+ Args:
+ model (str): model name. Available from /v1/models.
+ prompt (str): the input prompt.
+ suffix (str): The suffix that comes after a completion of inserted
+ text.
+ max_tokens (int): output token nums
+ temperature (float): to modulate the next token probability
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ n (int): How many chat completion choices to generate for each
+ input message. Only support one here.
+ stream: whether to stream the results or not. Default to false.
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ user (str): A unique identifier representing your end-user.
+ ignore_eos (bool): indicator for ignoring eos
+ session_id (int): if not specified, will set random value
+
+ Yields:
+ json objects in openai formats
+ """
+ pload = {
+ k: v
+ for k, v in locals().copy().items()
+ if k[:2] != '__' and k not in ['self']
+ }
+ headers = {'content-type': 'application/json'}
+ response = requests.post(self.completions_v1_url,
+ headers=headers,
+ json=pload,
+ stream=stream)
+ for chunk in response.iter_lines(chunk_size=8192,
+ decode_unicode=False,
+ delimiter=b'\n'):
+ if chunk:
+ if stream:
+                    decoded = chunk.decode('utf-8')
+ if decoded == 'data: [DONE]':
+ continue
+ if decoded[:6] == 'data: ':
+ decoded = decoded[6:]
+ output = json.loads(decoded)
+ yield output
+ else:
+ decoded = chunk.decode('utf-8')
+ output = json.loads(decoded)
+ yield output
+
+ def chat(self,
+ prompt: str,
+ session_id: int,
+ request_output_len: int = 512,
+ stream: bool = False,
+ top_p: float = 0.8,
+ top_k: int = 40,
+ temperature: float = 0.8,
+ repetition_penalty: float = 1.0,
+ ignore_eos: bool = False):
+ """Chat with a unique session_id.
+
+ Args:
+ prompt: the prompt to use for the generation.
+ session_id: determine which instance will be called.
+                If not specified with a value other than -1, a random value
+                will be used.
+ stream: whether to stream the results or not.
+ stop: whether to stop the session response or not.
+ request_output_len (int): output token nums
+ top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or
+ higher are kept for generation.
+ top_k (int): The number of the highest probability vocabulary
+ tokens to keep for top-k-filtering
+ temperature (float): to modulate the next token probability
+ repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ ignore_eos (bool): indicator for ignoring eos
+
+ Yields:
+ text, tokens, finish_reason
+ """
+ assert session_id != -1, 'please set a value other than -1'
+ for outputs in self.chat_interactive_v1(
+ prompt,
+ session_id=session_id,
+ request_output_len=request_output_len,
+ interactive_mode=True,
+ stream=stream,
+ top_k=top_k,
+ top_p=top_p,
+ temperature=temperature,
+ repetition_penalty=repetition_penalty,
+ ignore_eos=ignore_eos):
+ if outputs['finish_reason'] == 'length':
+                print('WARNING: exceeded session max length.'
+                      ' Please end the session.')
+ yield outputs['text'], outputs['tokens'], outputs['finish_reason']
+
+ def end_session(self, session_id: int):
+ """End the session with a unique session_id.
+
+ Args:
+ session_id: determine which instance will be called.
+                If not specified with a value other than -1, a random value
+                will be used.
+ """
+ for out in self.chat_interactive_v1(prompt='',
+ session_id=session_id,
+ request_output_len=0,
+ interactive_mode=False):
+ pass
+
+
+def input_prompt():
+    """Input a prompt in the console interface."""
+ print('\ndouble enter to end input >>> ', end='')
+ sentinel = '' # ends when this string is seen
+ return '\n'.join(iter(input, sentinel))
+
+
def get_streaming_response(prompt: str,
api_url: str,
session_id: int,
request_output_len: int = 512,
stream: bool = True,
- sequence_start: bool = True,
- sequence_end: bool = True,
+ interactive_mode: bool = False,
ignore_eos: bool = False,
stop: bool = False) -> Iterable[List[str]]:
headers = {'User-Agent': 'Test Client'}
@@ -30,8 +322,7 @@ def get_streaming_response(prompt: str,
'stream': stream,
'session_id': session_id,
'request_output_len': request_output_len,
- 'sequence_start': sequence_start,
- 'sequence_end': sequence_end,
+ 'interactive_mode': interactive_mode,
'ignore_eos': ignore_eos,
'stop': stop
}
@@ -50,43 +341,26 @@ def get_streaming_response(prompt: str,
yield output, tokens, finish_reason
-def input_prompt():
- """Input a prompt in the consolo interface."""
- print('\ndouble enter to end input >>> ', end='')
- sentinel = '' # ends when this string is seen
- return '\n'.join(iter(input, sentinel))
-
-
-def main(restful_api_url: str, session_id: int = 0):
- nth_round = 1
+def main(api_server_url: str, session_id: int = 0):
+ api_client = APIClient(api_server_url)
while True:
prompt = input_prompt()
- if prompt == 'exit':
- for output, tokens, finish_reason in get_streaming_response(
- '',
- f'{restful_api_url}/generate',
- session_id=session_id,
- request_output_len=0,
- sequence_start=(nth_round == 1),
- sequence_end=True):
- pass
- exit(0)
+ if prompt in ['exit', 'end']:
+ api_client.end_session(session_id)
+ if prompt == 'exit':
+ exit(0)
else:
- for output, tokens, finish_reason in get_streaming_response(
+ for text, tokens, finish_reason in api_client.chat(
prompt,
- f'{restful_api_url}/generate',
session_id=session_id,
request_output_len=512,
- sequence_start=(nth_round == 1),
- sequence_end=False):
+ stream=True):
if finish_reason == 'length':
- print('WARNING: exceed session max length.'
- ' Please end the session.')
continue
- print(output, end='')
-
- nth_round += 1
+ print(text, end='')
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
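
A minimal sketch of driving the new `APIClient` against a running api_server (the address is an assumption; substitute your own):

```python
from lmdeploy.serve.openai.api_client import APIClient

client = APIClient('http://0.0.0.0:23333')  # assumed api_server address
model_name = client.available_models[0]

# OpenAI-style chat completion
for output in client.chat_completions_v1(
        model=model_name,
        messages=[{'role': 'user', 'content': 'Hi, there'}]):
    print(output)

# stateful interactive chat bound to one session id
for text, tokens, finish_reason in client.chat('Hello', session_id=1):
    print(text, end='', flush=True)
client.end_session(session_id=1)
```
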
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 94271c4b9b..97e5e518c9 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -1,10 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import asyncio
import os
+import random
import time
from http import HTTPStatus
from typing import AsyncGenerator, List, Optional
-import fire
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
@@ -14,8 +15,10 @@
from lmdeploy.serve.openai.protocol import ( # noqa: E501
ChatCompletionRequest, ChatCompletionResponse,
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
- ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingsRequest,
- EmbeddingsResponse, ErrorResponse, GenerateRequest, GenerateResponse,
+ ChatCompletionStreamResponse, ChatMessage, CompletionRequest,
+ CompletionResponse, CompletionResponseChoice,
+ CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage,
+ EmbeddingsRequest, ErrorResponse, GenerateRequest, GenerateResponse,
ModelCard, ModelList, ModelPermission, UsageInfo)
os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -105,9 +108,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
1.0 means no penalty
Additional arguments supported by LMDeploy:
- - renew_session (bool): Whether renew the session. Can be used when the
- session length is exceeded.
- ignore_eos (bool): indicator for ignoring eos
+ - session_id (int): if not specified, will set random value
Currently we do not support the following features:
- function_call (Users should implement this by themselves)
@@ -115,20 +117,22 @@ async def chat_completions_v1(request: ChatCompletionRequest,
- presence_penalty (replaced with repetition_penalty)
- frequency_penalty (replaced with repetition_penalty)
"""
- session_id = ip2id(raw_request.client.host)
+ if request.session_id == -1:
+ request.session_id = random.randint(1, 10086)
error_check_ret = await check_request(request)
if error_check_ret is not None:
return error_check_ret
model_name = request.model
- request_id = str(session_id)
+ request_id = str(request.session_id)
created_time = int(time.time())
- result_generator = VariableInterface.async_engine.generate_openai(
+ result_generator = VariableInterface.async_engine.generate(
request.messages,
- session_id,
+ request.session_id,
True, # always use stream to enable batching
- request.renew_session,
+ sequence_start=True,
+ sequence_end=True,
request_output_len=request.max_tokens if request.max_tokens else 512,
stop=request.stop,
top_p=request.top_p,
@@ -189,7 +193,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
- VariableInterface.async_engine.stop_session(session_id)
+ VariableInterface.async_engine.stop_session(request.session_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
'Client disconnected')
final_res = res
@@ -223,43 +227,191 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
return response
-@app.post('/v1/embeddings')
-async def create_embeddings(request: EmbeddingsRequest,
- raw_request: Request = None):
- """Creates embeddings for the text."""
+@app.post('/v1/completions')
+async def completions_v1(request: CompletionRequest,
+ raw_request: Request = None):
+ """Completion API similar to OpenAI's API.
+
+ Go to `https://platform.openai.com/docs/api-reference/completions/create`
+ for the API specification.
+
+ The request should be a JSON object with the following fields:
+ - model (str): model name. Available from /v1/models.
+ - prompt (str): the input prompt.
+ - suffix (str): The suffix that comes after a completion of inserted text.
+ - max_tokens (int): output token nums
+ - temperature (float): to modulate the next token probability
+ - top_p (float): If set to float < 1, only the smallest set of most
+ probable tokens with probabilities that add up to top_p or higher
+ are kept for generation.
+ - n (int): How many chat completion choices to generate for each input
+ message. Only support one here.
+ - stream: whether to stream the results or not. Default to false.
+ - repetition_penalty (float): The parameter for repetition penalty.
+ 1.0 means no penalty
+ - user (str): A unique identifier representing your end-user.
+
+ Additional arguments supported by LMDeploy:
+ - ignore_eos (bool): indicator for ignoring eos
+ - session_id (int): if not specified, will set random value
+
+ Currently we do not support the following features:
+ - logprobs (not supported yet)
+ - presence_penalty (replaced with repetition_penalty)
+ - frequency_penalty (replaced with repetition_penalty)
+ """
+ if request.session_id == -1:
+ request.session_id = random.randint(1, 10086)
error_check_ret = await check_request(request)
if error_check_ret is not None:
return error_check_ret
- embedding = await VariableInterface.async_engine.get_embeddings(
- request.input)
- data = [{'object': 'embedding', 'embedding': embedding, 'index': 0}]
- token_num = len(embedding)
- return EmbeddingsResponse(
- data=data,
- model=request.model,
- usage=UsageInfo(
- prompt_tokens=token_num,
- total_tokens=token_num,
- completion_tokens=None,
- ),
- ).dict(exclude_none=True)
-
-
-@app.post('/generate')
-async def generate(request: GenerateRequest, raw_request: Request = None):
+ model_name = request.model
+ request_id = str(request.session_id)
+ created_time = int(time.time())
+ if isinstance(request.prompt, str):
+ request.prompt = [request.prompt]
+ generators = []
+ for i in range(len(request.prompt)):
+ result_generator = VariableInterface.async_engine.generate(
+ request.prompt[i],
+ request.session_id + i,
+ True, # always use stream to enable batching
+ sequence_start=True,
+ sequence_end=True,
+ request_output_len=request.max_tokens
+ if request.max_tokens else 512,
+ stop=False,
+ top_p=request.top_p,
+ temperature=request.temperature,
+ repetition_penalty=request.repetition_penalty,
+ ignore_eos=request.ignore_eos,
+ do_preprocess=False)
+ generators.append(result_generator)
+
+ def create_stream_response_json(
+ index: int,
+ text: str,
+ finish_reason: Optional[str] = None,
+ ) -> str:
+ choice_data = CompletionResponseStreamChoice(
+ index=index,
+ text=text,
+ finish_reason=finish_reason,
+ )
+ response = CompletionStreamResponse(
+ id=request_id,
+ created=created_time,
+ model=model_name,
+ choices=[choice_data],
+ )
+ response_json = response.model_dump_json()
+
+ return response_json
+
+ async def completion_stream_generator() -> AsyncGenerator[str, None]:
+ # First chunk with role
+ for generator in generators:
+ for i in range(request.n):
+ choice_data = CompletionResponseStreamChoice(
+ index=i,
+ text='',
+ finish_reason=None,
+ )
+ chunk = CompletionStreamResponse(id=request_id,
+ choices=[choice_data],
+ model=model_name)
+ data = chunk.model_dump_json(exclude_unset=True)
+ yield f'data: {data}\n\n'
+
+ async for res in generator:
+ response_json = create_stream_response_json(
+ index=0,
+ text=res.response,
+ )
+ yield f'data: {response_json}\n\n'
+ yield 'data: [DONE]\n\n'
+
+ # Streaming response
+ if request.stream:
+ return StreamingResponse(completion_stream_generator(),
+ media_type='text/event-stream')
+
+ # Non-streaming response
+ usage = UsageInfo()
+ choices = []
+
+ async def _inner_call(i, generator):
+ final_res = None
+ text = ''
+ async for res in generator:
+ if await raw_request.is_disconnected():
+ # Abort the request if the client disconnects.
+ VariableInterface.async_engine.stop_session(request.session_id)
+ return create_error_response(HTTPStatus.BAD_REQUEST,
+ 'Client disconnected')
+ final_res = res
+ text += res.response
+ assert final_res is not None
+ choice_data = CompletionResponseChoice(
+ index=0,
+ text=text,
+ finish_reason=final_res.finish_reason,
+ )
+ choices.append(choice_data)
+
+ total_tokens = sum([
+ final_res.history_token_len, final_res.input_token_len,
+ final_res.generate_token_len
+ ])
+ usage.prompt_tokens += final_res.input_token_len
+ usage.completion_tokens += final_res.generate_token_len
+ usage.total_tokens += total_tokens
+
+ await asyncio.gather(
+ *[_inner_call(i, generators[i]) for i in range(len(generators))])
+
+ response = CompletionResponse(
+ id=request_id,
+ created=created_time,
+ model=model_name,
+ choices=choices,
+ usage=usage,
+ )
+
+ return response
+
+
+@app.post('/v1/embeddings', tags=['unsupported'])
+async def create_embeddings(request: EmbeddingsRequest,
+ raw_request: Request = None):
+ """Creates embeddings for the text."""
+ return create_error_response(HTTPStatus.BAD_REQUEST,
+ 'Unsupported by turbomind.')
+
+
+@app.post('/generate',
+ tags=['deprecated'],
+ description='please use /v1/chat/interactive')
+@app.post('/v1/chat/interactive')
+async def chat_interactive_v1(request: GenerateRequest,
+ raw_request: Request = None):
"""Generate completion for the request.
+    - In interactive mode, the chat history is kept on the server. Please set
+      `interactive_mode = True`.
+    - In normal mode, no chat history is kept on the server. Set
+      `interactive_mode = False`.
+
The request should be a JSON object with the following fields:
- prompt: the prompt to use for the generation.
- session_id: determine which instance will be called. If not specified
- with a value other than -1, using host ip directly.
- - sequence_start (bool): indicator for starting a sequence.
- - sequence_end (bool): indicator for ending a sequence
+      with a value other than -1, a random value will be used.
+ - interactive_mode (bool): turn on interactive mode or not. On interactive
+ mode, session history is kept on the server (and vice versa).
- stream: whether to stream the results or not.
- stop: whether to stop the session response or not.
- request_output_len (int): output token nums
- - step (int): the offset of the k/v cache
- top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
@@ -271,15 +423,18 @@ async def generate(request: GenerateRequest, raw_request: Request = None):
- ignore_eos (bool): indicator for ignoring eos
"""
if request.session_id == -1:
- session_id = ip2id(raw_request.client.host)
- request.session_id = session_id
+ request.session_id = random.randint(10087, 23333)
- generation = VariableInterface.async_engine.generate(
+ async_engine = VariableInterface.async_engine
+ sequence_start = async_engine.steps.get(str(request.session_id), 0) == 0
+ sequence_end = not request.interactive_mode
+
+ generation = async_engine.generate(
request.prompt,
request.session_id,
stream_response=True, # always use stream to enable batching
- sequence_start=request.sequence_start,
- sequence_end=request.sequence_end,
+ sequence_start=sequence_start,
+ sequence_end=sequence_end,
request_output_len=request.request_output_len,
top_p=request.top_p,
top_k=request.top_k,
@@ -308,7 +463,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
async for out in generation:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
- VariableInterface.async_engine.stop_session(session_id)
+ async_engine.stop_session(request.session_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
'Client disconnected')
text += out.response
@@ -319,14 +474,15 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
def main(model_path: str,
- server_name: str = 'localhost',
+ server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 32,
tp: int = 1,
allow_origins: List[str] = ['*'],
allow_credentials: bool = True,
allow_methods: List[str] = ['*'],
- allow_headers: List[str] = ['*']):
+ allow_headers: List[str] = ['*'],
+ **kwargs):
"""An example to perform model inference through the command line
interface.
@@ -352,9 +508,12 @@ def main(model_path: str,
VariableInterface.async_engine = AsyncEngine(model_path=model_path,
instance_num=instance_num,
- tp=tp)
+ tp=tp,
+ **kwargs)
uvicorn.run(app=app, host=server_name, port=server_port, log_level='info')
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
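
The renamed `/v1/chat/interactive` route can be exercised directly with the updated `get_streaming_response` helper; `interactive_mode=True` keeps the chat history on the server, so follow-up prompts continue the same session. A sketch (server address assumed):

```python
from lmdeploy.serve.openai.api_client import get_streaming_response

api_server_url = 'http://0.0.0.0:23333'  # assumption: a running api_server
for text, tokens, finish_reason in get_streaming_response(
        'Hello',
        f'{api_server_url}/v1/chat/interactive',
        session_id=1,
        request_output_len=512,
        interactive_mode=True):
    print(text, end='', flush=True)
```
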
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index 756af1a4ca..bee2e2c91c 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -70,7 +70,7 @@ class ChatCompletionRequest(BaseModel):
user: Optional[str] = None
# additional argument of lmdeploy
repetition_penalty: Optional[float] = 1.0
- renew_session: Optional[bool] = False
+ session_id: Optional[int] = -1
ignore_eos: Optional[bool] = False
@@ -135,6 +135,10 @@ class CompletionRequest(BaseModel):
presence_penalty: Optional[float] = 0.0
frequency_penalty: Optional[float] = 0.0
user: Optional[str] = None
+ # additional argument of lmdeploy
+ repetition_penalty: Optional[float] = 1.0
+ session_id: Optional[int] = -1
+ ignore_eos: Optional[bool] = False
class CompletionResponseChoice(BaseModel):
@@ -175,7 +179,7 @@ class CompletionStreamResponse(BaseModel):
class EmbeddingsRequest(BaseModel):
"""Embedding request."""
model: str = None
- input: Union[str, List[Any]]
+ input: Union[str, List[str]]
user: Optional[str] = None
@@ -191,8 +195,7 @@ class GenerateRequest(BaseModel):
"""Generate request."""
prompt: Union[str, List[Dict[str, str]]]
session_id: int = -1
- sequence_start: bool = True
- sequence_end: bool = False
+ interactive_mode: bool = False
stream: bool = False
stop: bool = False
request_output_len: int = 512
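
With this schema change, clients send a single `interactive_mode` flag instead of the old `sequence_start`/`sequence_end` pair; a sketch of a request body matching the updated `GenerateRequest` (values are illustrative):

```python
# Body for POST /v1/chat/interactive under the new GenerateRequest schema.
payload = {
    'prompt': 'Hello',
    'session_id': 1,
    'interactive_mode': True,  # replaces sequence_start / sequence_end
    'stream': True,
    'stop': False,
    'request_output_len': 512,
}
```
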
diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py
index cc12fcff3b..5b89cc506a 100644
--- a/lmdeploy/serve/turbomind/chatbot.py
+++ b/lmdeploy/serve/turbomind/chatbot.py
@@ -459,6 +459,10 @@ def _stream_infer(self,
session.sequence_length = 0
input_ids, input_lengths = self.preprocess(prompt)
+        # the engine crashes if last_token_id == eos_id and input_ids is empty
+ if sequence_end and request_output_len == 0:
+ input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+ input_lengths = np.array([[1]], dtype=np.uint32)
input_tokens = input_lengths.squeeze()
if self.profile_generation:
yield StatusCode.TRITON_STREAM_ING, \
@@ -657,8 +661,13 @@ def stream_consumer(postprocess, res_queue, session, n_input_token,
continue
output_str = postprocess(
output_ids, np.array([[n_token]], dtype=np.uint32))
- n_token = output_ids.shape[-1]
text = output_str[0].decode()
+            # a trailing replacement char ('�') indicates a potentially
+            # unfinished byte sequence; hold it back and decode it together
+            # with the next chunk
+ if text.endswith('�'):
+ continue
+ n_token = output_ids.shape[-1]
if display:
print(text, end='', flush=True)
session.response += text
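
The trailing-'�' check works because an incomplete multi-byte UTF-8 sequence decoded with a replacement policy ends in the replacement character; a small generic illustration (the detokenizer's exact decode settings may differ):

```python
# Why a trailing '�' signals an unfinished byte sequence.
s = '你好'.encode('utf-8')                      # 6 bytes, 3 per character
part = s[:4].decode('utf-8', errors='replace')
print(part)                                     # '你�' -> hold back, wait for more bytes
print(part.endswith('�'))                       # True
print(s.decode('utf-8'))                        # '你好' -> complete sequence decodes cleanly
```
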
diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py
deleted file mode 100644
index cc8db88f5c..0000000000
--- a/lmdeploy/serve/turbomind/deploy.py
+++ /dev/null
@@ -1,1046 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import configparser
-import json
-import os
-import os.path as osp
-import re
-import shutil
-import sys
-from pathlib import Path
-
-import fire
-import safetensors
-import torch
-from safetensors.torch import load_file
-from sentencepiece import SentencePieceProcessor
-
-import lmdeploy
-from lmdeploy.model import MODELS
-
-supported_formats = ['llama', 'hf', 'awq', 'qwen']
-
-
-def get_package_root_path():
- import lmdeploy
- return Path(lmdeploy.__file__).parent
-
-
-def create_workspace(_path: str):
- """Create a workspace.
-
- Args:
- _path (str): the path of the workspace
- Returns:
- bool: success or not
- """
- try:
- if osp.exists(_path):
- shutil.rmtree(_path)
- os.makedirs(_path)
- print(f'create workspace in directory {_path}')
- return True
- except Exception as e:
- print(f'create workspace in {_path} failed: {e}')
- return False
-
-
-def destroy_workspace(_path: str):
- """destroy workspace.
-
- Args:
- _path(str): the path of the workspace
- Returns:
- bool: success or not
- """
- try:
- shutil.rmtree(_path)
- print(f'destroy workspace in directory {_path}')
- return True
- except Exception as e:
- print(f'destroy workspace in {_path} failed: {e}')
- return False
-
-
-def copy_triton_model_templates(_path: str):
- """copy triton model templates to the specified path.
-
- Args:
- _path (str): the target path
- Returns:
- str: the path of the triton models
- """
- try:
- cur_path = osp.abspath(__file__)
- dir_path = osp.dirname(cur_path)
- triton_models_path = osp.join(dir_path, 'triton_models')
- dst_path = osp.join(_path, 'triton_models')
- shutil.copytree(triton_models_path, dst_path, symlinks=True)
- print(f'copy triton model templates from "{triton_models_path}" to '
- f'"{dst_path}" successfully')
- shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
- return dst_path
- except Exception as e:
- print(f'copy triton model templates from "{triton_models_path}"'
- f' to "{dst_path}" failed: {e}')
- return None
-
-
-def tokenizer_info_sp(model_path: str):
- """Return the vocabulary size, bos token id and eos token id.
-
- Args:
- model_path (str): the tokenizer model's path
- Returns:
- tuple: vocabulary size, bos token id and eos token id
- """
- assert os.path.isfile(model_path), model_path
- sp_model = SentencePieceProcessor(model_file=model_path)
- # BOS / EOS token IDs
- n_words = sp_model.vocab_size()
- bos_id = sp_model.bos_id()
- eos_id = sp_model.eos_id()
- return n_words, bos_id, eos_id
-
-
-def tokenizer_info_qwen(model_dir: str):
- n_words = 151851
- bos_id = 0
- eos_id = 151643
- return n_words, bos_id, eos_id
-
-
-def load_checkpoint(model_path):
- """Load checkpoint files into torch format.
-
- Args:
- model_path (str): the checkpoint folder
- Returns:
- Dict[str, torch.Tensor]: weight in torch format
- """
- suffixes = ['.safetensors', '.bin']
- for suffix in suffixes:
- files = [
- file for file in os.listdir(model_path) if file.endswith(suffix)
- ]
- if len(files) > 0:
- break
-
- assert len(files) > 0, f'could not find checkpoints in {model_path}'
- files = sorted(files)
- print(files)
- params = {}
- for file in files:
- if file.endswith('.bin'):
- tmp = torch.load(osp.join(model_path, file), map_location='cpu')
- else:
- tmp = load_file(osp.join(model_path, file))
- params.update(tmp)
- return params
-
-
-def export(model_name: str,
- num_layer: int,
- norm_eps: float,
- kv_head_num: int,
- model_params: dict,
- tokenizer_path: str,
- out_dir: str,
- tp: int,
- size_per_head: int = 128,
- group_size: int = 0,
- weight_type: str = 'fp16',
- max_position_embeddings: int = 0,
- use_dynamic_ntk: int = 0,
- use_logn_attn: int = 0,
- rope_theta: float = 10000.0,
- tokenizer_info=tokenizer_info_sp):
- """Export deploying information to a config file.
-
- Args:
- model_name (str): model's name
- num_layer (int): the number of transformer blocks
- norm_eps (float): norm epsilon
- model_params (dict): parameters of a model
- tokenizer_path (str): the tokenizer model's path
- out_dir (str): the path of the output directory
- tp (int): the number of tensor parallelism
- size_per_head (int): the dimension of each head
- """
- out_dir = osp.join(out_dir, 'weights')
- os.makedirs(out_dir, exist_ok=True)
-
- def save_bin(param: torch.Tensor, name):
- print(name, param.shape)
- if param.dtype in [torch.float, torch.bfloat16]:
- param = param.half()
- param.contiguous().cpu().numpy().tofile(osp.join(out_dir, name))
-
- attn_bias = False
- inter_size = 0
-
- tok_embeddings = model_params['tok_embeddings.weight']
- _vocab_size, dim = tok_embeddings.shape
- head_num = dim // size_per_head
- if _vocab_size % tp != 0:
- # Resolve https://github.com/InternLM/lmdeploy/issues/266
- # Pad tok_embeddings and output weights, making their shape divisible by TP # noqa: E501
- pad_size = (_vocab_size + tp - 1) // tp * tp - _vocab_size
- # Pad weight at the bottom of dim 0
- model_params['tok_embeddings.weight'] = torch.nn.functional.pad(
- tok_embeddings, (0, 0, 0, pad_size), 'constant', 0)
- # Pad output weight at the bottom of dim 0
- model_params['output.weight'] = torch.nn.functional.pad(
- model_params['output.weight'], (0, 0, 0, pad_size), 'constant', 0)
-
- # reverse the splitting axes since the weights are transposed above
- for param_name, param_data in model_params.items():
- split_dim = None
- key, ext = param_name.split('.')[-2:]
- if key == 'w_qkv' and ext == 'bias':
- attn_bias = True
- copy = False
- if key in ['w1', 'w3', 'w13', 'w_qkv']:
- split_dim = -1
- # TODO: move parameter extraction outside of the loop
- if key == 'w1':
- inter_size = max(inter_size, param_data.shape[-1])
- elif key == 'w13':
- inter_size = max(inter_size, param_data.shape[-1] // 2)
- elif key in ['w2', 'wo']:
- if ext in ['bias']:
- copy = True
- else:
- split_dim = 0
- if split_dim is not None:
- print(f'*** splitting {param_name}, shape={param_data.shape}, '
- f'split_dim={split_dim}')
- assert param_data.shape[split_dim] % tp == 0
- split_size = param_data.shape[split_dim] // tp
- splits = torch.split(param_data, split_size, dim=split_dim)
- for i, split in enumerate(splits):
- prefix, ext = osp.splitext(param_name)
- save_bin(split, f'{prefix}.{i}{ext}')
- elif copy:
- print(f'### copying {param_name}, shape={param_data.shape}')
- copies = [param_data] * tp
- for i, copy in enumerate(copies):
- prefix, ext = osp.splitext(param_name)
- save_bin(copy, f'{prefix}.{i}{ext}')
- else:
- save_bin(param_data, param_name)
-
- assert inter_size > 0
-
- # export config and save it to {out_dir}/config.ini
- model = MODELS.get(model_name)()
- vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path)
- assert _vocab_size >= vocab_size, \
- f'different vocab size {_vocab_size} vs {vocab_size}'
- cfg = dict(llama=dict(
- model_name=model_name,
- head_num=head_num,
- kv_head_num=kv_head_num,
- size_per_head=size_per_head,
- vocab_size=_vocab_size,
- num_layer=num_layer,
- rotary_embedding=size_per_head,
- rope_theta=rope_theta,
- inter_size=inter_size,
- norm_eps=norm_eps,
- attn_bias=int(attn_bias),
- start_id=bos_id,
- end_id=eos_id,
- weight_type=weight_type,
- group_size=group_size,
- # parameters for turbomind
- max_batch_size=32,
- max_context_token_num=4,
- session_len=model.session_len + 8,
- step_length=1,
- cache_max_entry_count=48,
- cache_chunk_size=1,
- use_context_fmha=1,
- quant_policy=0,
- tensor_para_size=tp,
- # extra attention params
- max_position_embeddings=max_position_embeddings,
- use_dynamic_ntk=int(use_dynamic_ntk),
- use_logn_attn=int(use_logn_attn),
- ))
-
- config = configparser.ConfigParser()
- for section, key_values in cfg.items():
- config[section] = key_values
-
- config_path = osp.join(out_dir, 'config.ini')
- with open(config_path, 'w') as f:
- config.write(f)
- return True
-
-
-def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
- dim: int):
-
- def reshape(x):
- return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)
-
- qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
-
- # (input_dim, head_num + 2 * kv_head_num)
- return qkv.view(q.size(0), -1)
-
-
-def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- """
- if osp.exists(tokenizer_path):
- shutil.copy(tokenizer_path,
- osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- return False
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'params.json')
- with open(params_path) as f:
- model_arg = json.load(f)
- num_layer = model_arg['n_layers']
- norm_eps = model_arg['norm_eps']
- head_num = model_arg.get('n_heads', 32)
- kv_head_num = model_arg.get('n_kv_heads', head_num)
- except Exception as e:
- print(f'get "n_layers" and "norm_eps" from {params_path} failed: {e}')
- return False
-
- # convert weights from llama to turbomind format
- checkpoints = []
- for pattern in ['*.pth', '*.pt']:
- checkpoints += sorted(Path(model_path).glob(pattern))
- print(checkpoints)
- n_ckpt = len(checkpoints)
- model_params = {}
-
- def get_param(_name, _size):
- print(_name, _size)
- if _name not in model_params:
- model_params[_name] = torch.zeros(_size,
- dtype=torch.float16,
- device='cpu')
- return model_params[_name]
-
- for i, ckpt_path in enumerate(checkpoints):
- ckpt = torch.load(ckpt_path, map_location='cpu')
- for param_name, param_data in ckpt.items():
- key, ext = param_name.split('.')[-2:]
- # column-parallel
- if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
- size = param_data.size(0)
- if ext == 'weight':
- param = get_param(
- param_name,
- [size * n_ckpt, param_data.size(1)])
- param.data[size * i:size * (i + 1), :] = param_data
- else: # bias
- param = get_param(param_name, [size * n_ckpt])
- param.data[size * i:size * (i + 1)] = param_data
- # row-parallel
- elif key in ['w2', 'wo', 'tok_embeddings']:
- size = param_data.size(-1)
- if ext == 'weight':
- param = get_param(param_name,
- [param_data.size(0), size * n_ckpt])
- param.data[:, size * i:size * (i + 1)] = param_data
- else: # bias
- param = get_param(param_name, [size])
- param.data = param_data
- elif i == 0:
- param = get_param(param_name, param_data.size())
- param.data = param_data
- del ckpt
-
- for name, param in model_params.items():
- # transpose all weights as TurboMind is expecting column-major
- # weights: (output_dims, input_dims) -> (input_dims, output_dims)
- key = name.split('.')[-2]
- if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
- param.data = param.data.t()
-
- # concat qkv projection
- for t in ['weight', 'bias']:
- for i in range(1000):
- _qkv = [
- f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv']
- ]
- try:
- qkv = tuple(map(model_params.pop, _qkv))
- except KeyError:
- break
- # concat by heads
- qkv = merge_qkv(*qkv, tp, dim=2 if t == 'weight' else 1)
- print(f'layers.{i}.attention.w_qkv.{t}', qkv.shape)
- model_params[f'layers.{i}.attention.w_qkv.{t}'] = qkv
-
- assert i == 0 or num_layer == i, f'miss matched layers: {num_layer} vs {i}'
-
- return export(model_name, num_layer, norm_eps, kv_head_num, model_params,
- tokenizer_path, triton_models_path, tp)
-
-
-def permute(x: torch.Tensor):
- SIZE_PER_HEAD = 128
- if x.shape[-1] > 1:
- dim = x.shape[-1]
- n_heads = dim // SIZE_PER_HEAD
- return x.view(-1, n_heads, 2,
- dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
- else: # scales, zeros
- dim = x.shape[0]
- n_heads = dim // SIZE_PER_HEAD
- return x.view(n_heads, 2, dim // n_heads // 2,
- 1).transpose(1, 2).reshape(dim, 1)
-
-
-def deploy_hf(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- """
- if tokenizer_path is None:
- tokenizer_path = osp.join(model_path, 'tokenizer.model')
- if osp.exists(tokenizer_path):
- shutil.copy(tokenizer_path,
- osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
- for _file in os.listdir(model_path):
- if _file.endswith('.json') or _file.endswith('.py'):
- json_path = osp.join(model_path, _file)
- shutil.copy(json_path,
- osp.join(triton_models_path, 'tokenizer', _file))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- exit(-1)
-
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'config.json')
- with open(params_path) as f:
- model_arg = json.load(f)
- num_layer = model_arg['num_hidden_layers']
- norm_eps = model_arg['rms_norm_eps']
- rope_theta = float(model_arg.get('rope_theta', 10000.0))
- max_position_embeddings = int(
- model_arg.get('max_position_embeddings', 0))
- repo_scaling = bool(model_arg.get('rope_scaling', False))
- if 'num_key_value_heads' in model_arg:
- kv_head_num = model_arg['num_key_value_heads']
- else:
- kv_head_num = model_arg['num_attention_heads']
- except Exception as e:
- print(f'get "num_hidden_layers" and "rms_norm_eps" from '
- f'{params_path} failed: {e}')
- return False
-
- # convert weights from hf to turbomind
- model_params = {}
-
- _qweight = 'weight'
- _suffixes = [_qweight, 'bias']
-
- _params = load_checkpoint(model_path)
-
- def get_tensor(name):
- """return tensor according its name."""
- return _params[name]
-
- def get_tensor_transposed(name: str):
- """return a transposed tensor according its name."""
- if name not in _params and name.find('bias'):
- return None
- return _params[name].t()
-
- w_pack = False
- if 'model.layers.0.self_attn.W_pack.weight' in _params:
- w_pack = True
-
- for i in range(1000):
- try:
- # attention weights
- for suffix in _suffixes:
- if w_pack:
- _qkvo = [
- f'model.layers.{i}.self_attn.{t}'
- for t in ['W_pack', 'o_proj']
- ]
- qkv, o = map(get_tensor_transposed,
- map(('{}.' + suffix).format, _qkvo))
-
- if qkv is None:
- continue
- _shape = qkv.shape[1] // 3
- _qkv = torch.split(qkv, [_shape, _shape, _shape], dim=1)
- q = _qkv[0]
- k = _qkv[1]
- v = _qkv[2]
-
- else:
- _qkvo = [
- f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo'
- ]
- q, k, v, o = map(get_tensor_transposed,
- map(('{}.' + suffix).format, _qkvo))
- if q is None:
- continue
- # q, k has different layout for fb & hf, convert to fb's
- # layout
- q = permute(q)
- k = permute(k)
- if suffix == _qweight: # weight, qweight
- qkv = merge_qkv(q, k, v, tp, dim=2)
- print(suffix, qkv.shape)
- else: # scales, zeros, bias
- qkv = merge_qkv(q, k, v, tp, dim=1)
- print(suffix, qkv.shape)
- for k, v in [('w_qkv', qkv), ('wo', o)]:
- model_params[f'layers.{i}.attention.{k}.{suffix}'] = v
- # ffn weights
- _w123 = [
- f'model.layers.{i}.mlp.{t}_proj'
- for t in ['gate', 'down', 'up']
- ]
- for suffix in _suffixes:
- w1, w2, w3 = map(get_tensor_transposed,
- map(('{}.' + suffix).format, _w123))
- if w1 is None:
- continue
- if suffix in ['scales', 'zeros', 'bias']:
- w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3])
- for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]:
- model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v
- other = [('attention_norm.weight', 'input_layernorm.weight'),
- ('ffn_norm.weight', 'post_attention_layernorm.weight')]
- for ft, hf in other:
- model_params[f'layers.{i}.' +
- ft] = get_tensor(f'model.layers.{i}.' + hf)
- except safetensors.SafetensorError:
- break
- except KeyError:
- break
-
- assert num_layer == i, f'miss matched layers: {num_layer} vs {i}'
-
- other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
- ('norm.weight', 'model.norm.weight'),
- ('output.weight', 'lm_head.weight')]
- for ft, hf in other:
- model_params[ft] = get_tensor(hf)
-
- if model_name == 'baichuan2-7b':
- # https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/modeling_baichuan.py#L507
- # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
- model_params['output.weight'] = torch.nn.functional.normalize(
- model_params['output.weight'])
-
- return export(model_name,
- num_layer,
- norm_eps,
- kv_head_num,
- model_params,
- tokenizer_path,
- triton_models_path,
- tp,
- max_position_embeddings=max_position_embeddings,
- use_dynamic_ntk=repo_scaling,
- rope_theta=rope_theta)
-
-
-def deploy_awq(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int, quant_path: str,
- group_size: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- quant_path (str): path of the quantized model, which can be None
- group_size (int): a parameter used in AWQ to quantize fp16 weights
- to 4 bits
- """
- if tokenizer_path is None:
- tokenizer_path = osp.join(model_path, 'tokenizer.model')
- if osp.exists(tokenizer_path):
- shutil.copy(tokenizer_path,
- osp.join(triton_models_path, 'tokenizer/tokenizer.model'))
- for _file in os.listdir(model_path):
- if _file.endswith('.json') or _file.endswith('.py'):
- json_path = osp.join(model_path, _file)
- shutil.copy(json_path,
- osp.join(triton_models_path, 'tokenizer', _file))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- exit(-1)
-
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'config.json')
- with open(params_path) as f:
- model_arg = json.load(f)
- num_layer = model_arg['num_hidden_layers']
- norm_eps = model_arg['rms_norm_eps']
- rope_theta = float(model_arg.get('rope_theta', 10000.0))
- if 'num_key_value_heads' in model_arg:
- kv_head_num = model_arg['num_key_value_heads']
- else:
- kv_head_num = model_arg['num_attention_heads']
- except Exception as e:
- print(f'get "num_hidden_layers" and "rms_norm_eps" from '
- f'{params_path} failed: {e}')
- return False
-
- # convert weights from hf to turbomind
- if quant_path is None:
- _files = [
- osp.join(model_path, file) for file in os.listdir(model_path)
- if file.endswith('.bin')
- ]
- _files = sorted(_files)
- else:
- _files = [quant_path]
-
- model_params = {}
-
- _params = {}
- for _file in _files:
- _tmp = torch.load(_file, map_location='cpu')
- _params.update(_tmp)
-
- def get_tensor(name):
- """return tensor according its name."""
- return _params[name].cuda().contiguous()
-
- # import _turbomind as _tm
- # TODO: find another way import _turbomind
- lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
- sys.path.append(osp.join(lmdeploy_dir, 'lib'))
- import _turbomind as _tm # noqa: E402
-
- def transpose_qk_s4(src: torch.Tensor):
- assert src.is_contiguous()
- dst = torch.zeros_like(src)
- _tm.transpose_qk_s4_k_m8(src, dst,
- src.size(-1) * 8, src.size(0), group_size)
- return dst
-
- def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor,
- w1_s: torch.Tensor, w3_qw: torch.Tensor,
- w3_qz: torch.Tensor, w3_s: torch.Tensor):
-
- def fuse(a: torch.Tensor, b: torch.Tensor):
- ab = torch.cat((a, b)).contiguous()
- _ab = torch.zeros_like(ab)
- _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
- return _ab.view(a.size(0), -1)
-
- w13_qw = fuse(w1_qw, w3_qw)
- w13_qz = fuse(w1_qz, w3_qz)
-
- w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
- w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)
-
- return w13_qw, w13_qz, w13_s
-
- def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
- group_size: int):
- assert qw.is_contiguous()
- assert qz.is_contiguous()
- assert s.is_contiguous()
- _qw = torch.zeros_like(qw)
- _sz = torch.zeros_like(s, dtype=torch.int32) # half2
- _ws = torch.zeros_like(s)
- _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
- qw.size(-1) * 8, qw.size(0), group_size)
- return _qw, _sz
-
- def tp_m_s4(x: torch.Tensor, tp: int):
- return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
- 1).contiguous()
-
- attn_bias = False
-
- for i in range(num_layer):
- print(i)
-
- # attention weights
- q_qw = get_tensor(f'model.layers.{i}.self_attn.q_proj.qweight')
- k_qw = get_tensor(f'model.layers.{i}.self_attn.k_proj.qweight')
- v_qw = get_tensor(f'model.layers.{i}.self_attn.v_proj.qweight')
- o_qw = get_tensor(f'model.layers.{i}.self_attn.o_proj.qweight')
-
- q_qz = get_tensor(f'model.layers.{i}.self_attn.q_proj.qzeros')
- k_qz = get_tensor(f'model.layers.{i}.self_attn.k_proj.qzeros')
- v_qz = get_tensor(f'model.layers.{i}.self_attn.v_proj.qzeros')
- o_qz = get_tensor(f'model.layers.{i}.self_attn.o_proj.qzeros')
-
- q_s = get_tensor(f'model.layers.{i}.self_attn.q_proj.scales')
- k_s = get_tensor(f'model.layers.{i}.self_attn.k_proj.scales')
- v_s = get_tensor(f'model.layers.{i}.self_attn.v_proj.scales')
- o_s = get_tensor(f'model.layers.{i}.self_attn.o_proj.scales')
-
- try:
- q_b = get_tensor(f'model.layers.{i}.self_attn.q_proj.bias')
- k_b = get_tensor(f'model.layers.{i}.self_attn.k_proj.bias')
- v_b = get_tensor(f'model.layers.{i}.self_attn.v_proj.bias')
- o_b = get_tensor(f'model.layers.{i}.self_attn.o_proj.bias')
- attn_bias = True
- except: # noqa: E722
- pass
-
- q_qw = transpose_qk_s4(q_qw)
- k_qw = transpose_qk_s4(k_qw)
- q_qz = transpose_qk_s4(q_qz)
- k_qz = transpose_qk_s4(k_qz)
- q_s = permute(q_s)
- k_s = permute(k_s)
-
- qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
- qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
- qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
-
- qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
-
- qkv_qw = tp_m_s4(qkv_qw, tp)
-
- model_params[f'layers.{i}.attention.w_qkv.qweight'] = qkv_qw
- model_params[f'layers.{i}.attention.w_qkv.scales_zeros'] = qkv_sz
-
- o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
-
- model_params[f'layers.{i}.attention.wo.qweight'] = o_qw
- model_params[f'layers.{i}.attention.wo.scales_zeros'] = o_sz
-
- if attn_bias:
- q_b = permute(q_b)
- k_b = permute(k_b)
- qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
- model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b
- model_params[f'layers.{i}.attention.wo.bias'] = o_b
-
- # ffn weights
- w1_qw = get_tensor(f'model.layers.{i}.mlp.gate_proj.qweight')
- w2_qw = get_tensor(f'model.layers.{i}.mlp.down_proj.qweight')
- w3_qw = get_tensor(f'model.layers.{i}.mlp.up_proj.qweight')
-
- w1_qz = get_tensor(f'model.layers.{i}.mlp.gate_proj.qzeros')
- w2_qz = get_tensor(f'model.layers.{i}.mlp.down_proj.qzeros')
- w3_qz = get_tensor(f'model.layers.{i}.mlp.up_proj.qzeros')
-
- w1_s = get_tensor(f'model.layers.{i}.mlp.gate_proj.scales')
- w2_s = get_tensor(f'model.layers.{i}.mlp.down_proj.scales')
- w3_s = get_tensor(f'model.layers.{i}.mlp.up_proj.scales')
-
- w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
- w3_s)
-
- w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
- w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
-
- w13_qw = tp_m_s4(w13_qw, tp)
-
- model_params[f'layers.{i}.feed_forward.w13.qweight'] = w13_qw
- model_params[f'layers.{i}.feed_forward.w13.scales_zeros'] = w13_sz
-
- model_params[f'layers.{i}.feed_forward.w2.qweight'] = w2_qw
- model_params[f'layers.{i}.feed_forward.w2.scales_zeros'] = w2_sz
-
- # norm weights
- attn_norm = get_tensor(f'model.layers.{i}.input_layernorm.weight')
- ffn_norm = get_tensor(
- f'model.layers.{i}.post_attention_layernorm.weight')
-
- model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
- model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm
-
- other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
- ('norm.weight', 'model.norm.weight'),
- ('output.weight', 'lm_head.weight')]
- for ft, hf in other:
- model_params[ft] = get_tensor(hf)
-
- return export(model_name,
- num_layer,
- norm_eps,
- kv_head_num,
- model_params,
- tokenizer_path,
- triton_models_path,
- tp,
- weight_type='int4',
- group_size=group_size,
- rope_theta=rope_theta)
-
-
-def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
- triton_models_path: str, tp: int):
- """Deploy a model with huggingface transformers' format.
-
- Args:
- model_name (str): the name of the to-be-deployed model
- model_path (str): the path of the directory where the model weight
- files are
- tokenizer_path (str): the path of the tokenizer model path
- triton_models_path (str): the path of the exported triton models
- tp (int): the number of tensor parallelism
- quant_path (str): path of the quantized model, which can be None
- group_size (int): a parameter used in AWQ to quantize fp16 weights
- to 4 bits
- """
-
- if osp.exists(model_path):
- shutil.copy(osp.join(model_path, 'qwen.tiktoken'),
- osp.join(triton_models_path, 'tokenizer'))
- for _file in os.listdir(model_path):
- if _file.endswith('.json') or _file.endswith('.py'):
- json_path = osp.join(model_path, _file)
- shutil.copy(json_path,
- osp.join(triton_models_path, 'tokenizer', _file))
- with get_package_root_path() as root_path:
- shutil.copy(osp.join(root_path, 'tokenizer.py'),
- osp.join(triton_models_path, 'tokenizer'))
- else:
- print(f'tokenizer model {tokenizer_path} does not exist')
- exit(-1)
-
- # read model arguments from params.json
- try:
- params_path = osp.join(model_path, 'config.json')
- with open(params_path) as f:
- config = json.load(f)
- num_layer = config['num_hidden_layers']
- norm_eps = config['layer_norm_epsilon']
- rope_theta = float(config.get('rotary_emb_base', 10000.0))
- if 'num_key_value_heads' in config:
- kv_head_num = config['num_key_value_heads']
- else:
- kv_head_num = config['num_attention_heads']
- seq_length = config['seq_length']
- use_dynamic_ntk = config['use_dynamic_ntk']
- use_logn_attn = config['use_logn_attn']
- except Exception as e:
- print(f'get "num_hidden_layers" and "layer_norm_epsilon" from '
- f'{params_path} failed: {e}')
- return False
-
- # convert weights from hf to turbomind
- model_params = {}
-
- _params = load_checkpoint(model_path)
-
- def get_tensor(name, trans=True):
- """return a transposed tensor according its name."""
- if trans:
- return _params[name].cuda().t()
- else:
- return _params[name].cuda()
-
- for i in range(num_layer):
- print(i)
-
- # qkv weights
- qkv_w = get_tensor(f'transformer.h.{i}.attn.c_attn.weight')
- q_w, k_w, v_w = torch.split(qkv_w, qkv_w.size(-1) // 3, dim=-1)
- q_w, k_w = permute(q_w), permute(k_w)
- qkv_w = merge_qkv(q_w, k_w, v_w, tp, dim=2)
- model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv_w
-
- # qkv bias
- qkv_b = get_tensor(f'transformer.h.{i}.attn.c_attn.bias')
- q_b, k_b, v_b = torch.split(qkv_b, qkv_b.size(-1) // 3)
- q_b, k_b = permute(q_b), permute(k_b)
- qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
- model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b
-
- # o weights
- o_w = get_tensor(f'transformer.h.{i}.attn.c_proj.weight')
- model_params[f'layers.{i}.attention.wo.weight'] = o_w
- model_params[f'layers.{i}.attention.wo.bias'] = torch.zeros_like(q_b)
-
- # ffn weights
- # ours: w2(silu(w1(x)) * w3(x))
- # qwen: c_proj(w1(x) * silu(w2(x)))
- w1 = get_tensor(f'transformer.h.{i}.mlp.w2.weight')
- w3 = get_tensor(f'transformer.h.{i}.mlp.w1.weight')
- w2 = get_tensor(f'transformer.h.{i}.mlp.c_proj.weight')
- model_params[f'layers.{i}.feed_forward.w1.weight'] = w1
- model_params[f'layers.{i}.feed_forward.w2.weight'] = w2
- model_params[f'layers.{i}.feed_forward.w3.weight'] = w3
-
- # norm weights
- attn_norm = get_tensor(f'transformer.h.{i}.ln_1.weight')
- ffn_norm = get_tensor(f'transformer.h.{i}.ln_2.weight')
-
- model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
- model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm
-
- other = [('tok_embeddings.weight', 'transformer.wte.weight'),
- ('norm.weight', 'transformer.ln_f.weight'),
- ('output.weight', 'lm_head.weight')]
- for ft, hf in other:
- model_params[ft] = get_tensor(hf, trans=False)
-
- return export(model_name,
- num_layer,
- norm_eps,
- kv_head_num,
- model_params,
- model_path,
- triton_models_path,
- tp,
- max_position_embeddings=seq_length,
- use_dynamic_ntk=use_dynamic_ntk,
- use_logn_attn=use_logn_attn,
- rope_theta=rope_theta,
- tokenizer_info=tokenizer_info_qwen)
-
-
-def pack_model_repository(workspace_path: str):
- """package the model repository.
-
- Args:
- workspace_path: the path of workspace
- """
- os.symlink(src='../../tokenizer',
- dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
- '1', 'tokenizer'))
- os.symlink(src='../../tokenizer',
- dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
- '1', 'tokenizer'))
- os.symlink(src='../../weights',
- dst=osp.join(workspace_path, 'triton_models', 'interactive',
- '1', 'weights'))
- model_repo_dir = osp.join(workspace_path, 'model_repository')
- os.makedirs(model_repo_dir, exist_ok=True)
- os.symlink(src=osp.join('../triton_models/interactive'),
- dst=osp.join(model_repo_dir, 'turbomind'))
- os.symlink(src=osp.join('../triton_models/preprocessing'),
- dst=osp.join(model_repo_dir, 'preprocessing'))
- os.symlink(src=osp.join('../triton_models/postprocessing'),
- dst=osp.join(model_repo_dir, 'postprocessing'))
-
-
-def main(model_name: str,
- model_path: str,
- model_format: str = None,
- tokenizer_path: str = None,
- dst_path: str = './workspace',
- tp: int = 1,
- quant_path: str = None,
- group_size: int = 0):
- """deploy llama family models via turbomind.
-
- Args:
- model_name (str): the name of the to-be-deployed model, such as
- llama-7b, llama-13b, vicuna-7b and etc
- model_path (str): the directory path of the model
- model_format (str): the format of the model, fb or hf. 'fb' stands for
- META's llama format, and 'hf' means huggingface format
- tokenizer_path (str): the path of tokenizer model
- dst_path (str): the destination path that saves outputs
- tp (int): the number of GPUs used for tensor parallelism, should be 2^n
- quant_path (str): path of the quantized model, which can be None
- group_size (int): a parameter used in AWQ to quantize fp16 weights
- to 4 bits
- """
- assert model_name in MODELS.module_dict.keys(), \
- f"'{model_name}' is not supported. " \
- f'The supported models are: {MODELS.module_dict.keys()}'
-
- assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
-
- if model_format is None:
- model_format = 'qwen' if model_name == 'qwen-7b' else 'hf'
-
- if model_format not in supported_formats:
- print(f'the model format "{model_format}" is not supported. '
- f'The supported format are: {supported_formats}')
- exit(-1)
-
- if model_format == 'llama' and tokenizer_path is None:
- print('The model is llama. Its tokenizer model path should be '
- 'specified')
- exit(-1)
-
- if not create_workspace(dst_path):
- exit(-1)
-
- triton_models_path = copy_triton_model_templates(dst_path)
- if triton_models_path is None:
- exit(-1)
-
- if model_format == 'llama':
- res = deploy_llama(model_name, model_path, tokenizer_path,
- triton_models_path, tp)
- elif model_format == 'hf':
- res = deploy_hf(model_name, model_path, tokenizer_path,
- triton_models_path, tp)
- elif model_format == 'awq':
- res = deploy_awq(model_name, model_path, tokenizer_path,
- triton_models_path, tp, quant_path, group_size)
- elif model_format == 'qwen':
- res = deploy_qwen(model_name, model_path, tokenizer_path,
- triton_models_path, tp)
-
- # update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
- with open(osp.join(triton_models_path, 'interactive/config.pbtxt'),
- 'a') as f:
- param = \
- 'parameters {\n key: "tensor_para_size"\n value: {\n ' \
- 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \
- 'parameters {\n key: "model_name"\n value: {\n ' \
- 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n'
- f.write(param)
- if not res:
- print(f'deploy model "{model_name}" via turbomind failed')
- destroy_workspace(dst_path)
- exit(-1)
-
- # pack model repository for triton inference server
- pack_model_repository(dst_path)
-
- # update the value of $TP in `service_docker_up.sh`
- file_path = osp.join(dst_path, 'service_docker_up.sh')
- with open(file_path, 'r') as f:
- content = f.read()
- content = re.sub('TP=1', f'TP={tp}', content)
- with open(file_path, 'w') as f:
- f.write(content)
-
-
-if __name__ == '__main__':
- fire.Fire(main)
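
For reference, the exporter deleted above padded `tok_embeddings.weight` and `output.weight` so that the vocabulary dimension stays divisible by the tensor-parallel degree (the issue #266 fix). A small numeric sketch of that padding rule; the shapes are made up for illustration:

```python
import torch

vocab_size, hidden, tp = 32001, 4096, 2
pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size  # -> 1 extra row
weight = torch.zeros(vocab_size, hidden)
# pad rows at the bottom of dim 0 so every rank receives an equal slice
padded = torch.nn.functional.pad(weight, (0, 0, 0, pad_size), 'constant', 0)
assert padded.shape[0] % tp == 0
```
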
diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index 296d453ed4..231601fde0 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
+import os
import os.path as osp
from typing import Optional, Sequence, Union
@@ -16,7 +17,7 @@ class SentencePieceTokenizer:
def __init__(self, model_file: str):
from sentencepiece import SentencePieceProcessor
self.model = SentencePieceProcessor(model_file=model_file)
- self._no_prefix_space_tokens = None
+ self._prefix_space_tokens = None
@property
def vocab_size(self):
@@ -34,19 +35,20 @@ def eos_token_id(self):
return self.model.eos_id()
@property
- def no_prefix_space_tokens(self):
+ def prefix_space_tokens(self):
"""tokens without prefix space."""
- if self._no_prefix_space_tokens is None:
+ if self._prefix_space_tokens is None:
vocab = self.model.IdToPiece(list(range(self.vocab_size)))
- self._no_prefix_space_tokens = {
+ self._prefix_space_tokens = {
i
- for i, tok in enumerate(vocab) if not tok.startswith('▁')
+ for i, tok in enumerate(vocab) if tok.startswith('▁')
}
- return self._no_prefix_space_tokens
+ return self._prefix_space_tokens
def _maybe_add_prefix_space(self, tokens, decoded):
"""maybe add prefix space for incremental decoding."""
- if len(tokens) and tokens[0] not in self.no_prefix_space_tokens:
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
return ' ' + decoded
else:
return decoded
@@ -111,8 +113,7 @@ class HuggingFaceTokenizer:
"""
def __init__(self, model_dir: str, trust_remote_code=True):
- from transformers import (AutoTokenizer, CodeLlamaTokenizerFast,
- LlamaTokenizerFast)
+ from transformers import AutoTokenizer
model_file = osp.join(model_dir, 'tokenizer.model')
backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
model_file_exists = osp.exists(model_file)
@@ -121,20 +122,22 @@ def __init__(self, model_dir: str, trust_remote_code=True):
'It may take long time to initialize the tokenizer.')
self.model = AutoTokenizer.from_pretrained(
model_dir, trust_remote_code=trust_remote_code)
- self.need_padding = isinstance(self.model, LlamaTokenizerFast) \
- or isinstance(self.model, CodeLlamaTokenizerFast)
- self._no_prefix_space_tokens = None
+ self._prefix_space_tokens = None
# save tokenizer.json to reuse
if not osp.exists(backend_tokenizer_file) and model_file_exists:
if hasattr(self.model, 'backend_tokenizer'):
- self.model.backend_tokenizer.save(backend_tokenizer_file)
+ if os.access(model_dir, os.W_OK):
+ self.model.backend_tokenizer.save(backend_tokenizer_file)
if self.model.eos_token_id is None:
generation_config_file = osp.join(model_dir,
'generation_config.json')
- with open(generation_config_file, 'r') as f:
- cfg = json.load(f)
- self.model.eos_token_id = cfg['eos_token_id']
+ if osp.exists(generation_config_file):
+ with open(generation_config_file, 'r') as f:
+ cfg = json.load(f)
+ self.model.eos_token_id = cfg['eos_token_id']
+ elif hasattr(self.model, 'eod_id'): # Qwen remote
+ self.model.eos_token_id = self.model.eod_id
@property
def vocab_size(self):
@@ -152,21 +155,22 @@ def eos_token_id(self):
return self.model.eos_token_id
@property
- def no_prefix_space_tokens(self):
+ def prefix_space_tokens(self):
"""tokens without prefix space."""
- if self._no_prefix_space_tokens is None:
+ if self._prefix_space_tokens is None:
vocab = self.model.convert_ids_to_tokens(
list(range(self.vocab_size)))
- self._no_prefix_space_tokens = {
+ self._prefix_space_tokens = {
i
- for i, tok in enumerate(vocab) if not tok.startswith('▁')
+ for i, tok in enumerate(vocab)
+ if tok.startswith('▁' if isinstance(tok, str) else b' ')
}
- return self._no_prefix_space_tokens
+ return self._prefix_space_tokens
def _maybe_add_prefix_space(self, tokens, decoded):
"""maybe add prefix space for incremental decoding."""
- if self.need_padding and len(
- tokens) and tokens[0] not in self.no_prefix_space_tokens:
+ if len(tokens) and not decoded.startswith(' ') and\
+ tokens[0] in self.prefix_space_tokens:
return ' ' + decoded
else:
return decoded
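
Both tokenizer wrappers now collect the ids of tokens that begin with `▁` (SentencePiece's word-boundary marker) and re-insert the leading space that incremental decoding drops. A minimal sketch of the check; the id set below is a stand-in for the real vocabulary:

```python
# Illustrative only: prefix_space_tokens would be built from the vocabulary.
prefix_space_tokens = {7, 42}  # ids whose piece starts with '▁'

def maybe_add_prefix_space(tokens, decoded):
    """Re-add the space lost when a chunk is decoded in isolation."""
    if tokens and not decoded.startswith(' ') and tokens[0] in prefix_space_tokens:
        return ' ' + decoded
    return decoded

print(maybe_add_prefix_space([42, 3], 'world'))  # ' world'
print(maybe_add_prefix_space([3, 42], 'foo'))    # 'foo'
```
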
diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py
index de31a5daa7..8091dd29b4 100644
--- a/lmdeploy/turbomind/chat.py
+++ b/lmdeploy/turbomind/chat.py
@@ -4,12 +4,6 @@
import os.path as osp
import random
-import fire
-
-from lmdeploy import turbomind as tm
-from lmdeploy.model import MODELS
-from lmdeploy.tokenizer import Tokenizer
-
os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -73,9 +67,9 @@ def get_gen_param(cap,
def main(model_path,
session_id: int = 1,
cap: str = 'chat',
- sys_instruct: str = None,
- tp=1,
- stream_output=True,
+ tp: int = 1,
+ stream_output: bool = True,
+ request_output_len: int = 512,
**kwargs):
"""An example to perform model inference through the command line
interface.
@@ -85,24 +79,27 @@ def main(model_path,
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infilling', 'chat', 'python']
- sys_instruct (str): the content of 'system' role, which is used by
- conversational model
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
**kwarg (dict): other arguments for initializing model's chat template
"""
+ from lmdeploy import turbomind as tm
+ from lmdeploy.tokenizer import Tokenizer
+
tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
- tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
+ tm_model = tm.TurboMind(model_path,
+ eos_id=tokenizer.eos_token_id,
+ tp=tp,
+ capability=cap,
+ **kwargs)
generator = tm_model.create_instance()
nth_round = 1
step = 0
seed = random.getrandbits(64)
model_name = tm_model.model_name
- model = MODELS.get(model_name)(capability=cap, **kwargs) \
- if sys_instruct is None else MODELS.get(model_name)(
- capability=cap, system=sys_instruct, **kwargs)
+ model = tm_model.model
print(f'session {session_id}')
while True:
@@ -112,12 +109,13 @@ def main(model_path,
elif prompt == 'end':
prompt = model.get_prompt('', nth_round == 1)
input_ids = tokenizer.encode(prompt)
- for outputs in generator.stream_infer(session_id=session_id,
- input_ids=[input_ids],
- request_output_len=512,
- sequence_start=False,
- sequence_end=True,
- stream_output=stream_output):
+ for outputs in generator.stream_infer(
+ session_id=session_id,
+ input_ids=[input_ids],
+ request_output_len=request_output_len,
+ sequence_start=False,
+ sequence_end=True,
+ stream_output=stream_output):
pass
nth_round = 1
step = 0
@@ -125,13 +123,14 @@ def main(model_path,
else:
prompt = model.get_prompt(prompt, nth_round == 1)
input_ids = tokenizer.encode(prompt)
- if step + len(input_ids) >= tm_model.session_len:
+ if step + len(
+ input_ids) + request_output_len >= tm_model.session_len:
print('WARNING: exceed session max length.'
' Please end the session.')
continue
gen_param = get_gen_param(cap, model.sampling_param, nth_round,
- step, **kwargs)
+ step, request_output_len, **kwargs)
print(f'{prompt} ', end='', flush=True)
response_size = 0
@@ -145,6 +144,11 @@ def main(model_path,
res, tokens = outputs[0]
# decode res
response = tokenizer.decode(res.tolist(), offset=response_size)
+ # a '�' at the end means a potentially unfinished utf-8
+ # byte sequence; concatenate it with the next chunk and
+ # decode them together
+ if response.endswith('�'):
+ continue
response = valid_str(response)
print(f'{response}', end='', flush=True)
response_size = tokens
@@ -157,4 +161,6 @@ def main(model_path,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
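
The chat CLI now rejects a prompt when the accumulated context plus the requested completion would overflow the model's session window, rather than checking the prompt length alone. The guard as a tiny standalone sketch; the `session_len` value is illustrative:

```python
def fits_in_session(step, prompt_len, request_output_len, session_len=2056):
    """True if the next turn still fits in the session window."""
    return step + prompt_len + request_output_len < session_len

print(fits_in_session(step=1500, prompt_len=100, request_output_len=512))  # False
print(fits_in_session(step=100, prompt_len=100, request_output_len=512))   # True
```
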
diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py
index daef35298c..5ba4675c59 100644
--- a/lmdeploy/turbomind/decode.py
+++ b/lmdeploy/turbomind/decode.py
@@ -2,7 +2,6 @@
import os
import os.path as osp
-import fire
import torch
from lmdeploy import turbomind as tm
@@ -37,4 +36,6 @@ def main(model_path, inputs):
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/deploy/__init__.py b/lmdeploy/turbomind/deploy/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py
new file mode 100644
index 0000000000..4876002020
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/converter.py
@@ -0,0 +1,249 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import re
+import shutil
+from pathlib import Path
+
+import fire
+
+from lmdeploy.model import MODELS
+
+from .source_model.base import INPUT_MODELS
+from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
+
+supported_formats = ['llama', 'hf', 'awq', None]
+special_input_model_map = {
+ 'qwen': 'qwen',
+ 'baichuan': 'baichuan',
+ 'baichuan2': 'baichuan2'
+}
+
+
+def get_package_root_path():
+ """Get lmdeploy root path."""
+ import lmdeploy
+ return Path(lmdeploy.__file__).parent
+
+
+def get_tokenizer_path(model_path: str, tokenizer_path: str):
+ """Get tokenizer path if not given."""
+ if tokenizer_path is not None:
+ assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exist.'
+ return tokenizer_path
+ candidate = ['tokenizer.model', 'qwen.tiktoken']
+ for name in candidate:
+ tmp_path = osp.join(model_path, name)
+ if osp.exists(tmp_path):
+ tokenizer_path = tmp_path
+ break
+ assert tokenizer_path, 'please supply the tokenizer path via --tokenizer-path'
+ return tokenizer_path
+
+
+def get_model_format(model_name: str, model_format: str):
+ """Get model format if not given or equal awq."""
+ # get model name prefix
+ if model_name.find('-') != -1:
+ model_name = model_name[:model_name.find('-')]
+ # rules:
+ # 1) llama -> match special -> hf (if not matched)
+ # 2) append awq (if model_format is awq)
+ inferred_model_format = model_format
+ if model_format in [None, 'hf']:
+ inferred_model_format = special_input_model_map.get(model_name, 'hf')
+ elif model_format == 'awq':
+ inferred_model_format = special_input_model_map.get(model_name,
+ 'hf') + '-awq'
+ return inferred_model_format
+
+
+def create_workspace(_path: str):
+ """Create a workspace.
+
+ Args:
+ _path (str): the path of the workspace
+ """
+ if osp.exists(_path):
+ print(f'remove workspace in directory {_path}')
+ shutil.rmtree(_path)
+ print(f'create workspace in directory {_path}')
+ os.makedirs(_path)
+
+
+def copy_triton_model_templates(_path: str):
+ """copy triton model templates to the specified path.
+
+ Args:
+ _path (str): the target path
+ Returns:
+ str: the path of the triton models
+ """
+
+ root = get_package_root_path()
+ dir_path = osp.join(root, 'serve', 'turbomind')
+ triton_models_path = osp.join(dir_path, 'triton_models')
+ dst_path = osp.join(_path, 'triton_models')
+ print(f'copy triton model templates from "{triton_models_path}" to '
+ f'"{dst_path}"')
+ shutil.copytree(triton_models_path, dst_path, symlinks=True)
+ service_docker_up_file = osp.join(dir_path, 'service_docker_up.sh')
+ print(f'copy service_docker_up.sh from "{service_docker_up_file}" to '
+ f'"{_path}"')
+ shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
+ return dst_path
+
+
+def copy_tokenizer(model_path: str, tokenizer_path: str,
+ triton_models_path: str):
+ """Copy tokenizer."""
+ shutil.copy(
+ tokenizer_path,
+ osp.join(triton_models_path,
+ osp.join('tokenizer', osp.basename(tokenizer_path))))
+ for _file in os.listdir(model_path):
+ if _file.endswith('.json') or _file.endswith('.py'):
+ json_path = osp.join(model_path, _file)
+ shutil.copy(json_path,
+ osp.join(triton_models_path, 'tokenizer', _file))
+ with get_package_root_path() as root_path:
+ shutil.copy(osp.join(root_path, 'tokenizer.py'),
+ osp.join(triton_models_path, 'tokenizer'))
+
+
+def pack_model_repository(workspace_path: str):
+ """package the model repository.
+
+ Args:
+ workspace_path: the path of workspace
+ """
+ os.symlink(src=osp.join('..', '..', 'tokenizer'),
+ dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
+ '1', 'tokenizer'))
+ os.symlink(src=osp.join('..', '..', 'tokenizer'),
+ dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
+ '1', 'tokenizer'))
+ os.symlink(src=osp.join('..', '..', 'weights'),
+ dst=osp.join(workspace_path, 'triton_models', 'interactive',
+ '1', 'weights'))
+ model_repo_dir = osp.join(workspace_path, 'model_repository')
+ os.makedirs(model_repo_dir, exist_ok=True)
+ os.symlink(src=osp.join('..', 'triton_models', 'interactive'),
+ dst=osp.join(model_repo_dir, 'turbomind'))
+ os.symlink(src=osp.join('..', 'triton_models', 'preprocessing'),
+ dst=osp.join(model_repo_dir, 'preprocessing'))
+ os.symlink(src=osp.join('..', 'triton_models', 'postprocessing'),
+ dst=osp.join(model_repo_dir, 'postprocessing'))
+
+
+def main(model_name: str,
+ model_path: str,
+ model_format: str = None,
+ tokenizer_path: str = None,
+ dst_path: str = 'workspace',
+ tp: int = 1,
+ quant_path: str = None,
+ group_size: int = 0):
+ """deploy llama family models via turbomind.
+
+ Args:
+ model_name (str): the name of the to-be-deployed model, such as
+ llama-7b, llama-13b, vicuna-7b and etc
+ model_path (str): the directory path of the model
+ model_format (str): the format of the model, should choose from
+ ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
+ format, 'hf' means huggingface llama format, and 'awq' means
+ llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
+ the default value is None, which means the model_format will be
+ inferred based on model_name
+ tokenizer_path (str): the path of tokenizer model
+ dst_path (str): the destination path that saves outputs
+ tp (int): the number of GPUs used for tensor parallelism, should be 2^n
+ quant_path (str): Path of the quantized model, which can be None.
+ group_size (int): a parameter used in AWQ to quantize fp16 weights
+ to 4 bits
+ """
+
+ assert model_name in MODELS.module_dict.keys(), \
+ f"'{model_name}' is not supported. " \
+ f'The supported models are: {MODELS.module_dict.keys()}'
+
+ assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
+
+ output_format = 'fp16'
+
+ # get input model format
+ assert model_format in supported_formats, 'the model format ' \
+ f'should be in {supported_formats}'
+
+ inferred_model_format = get_model_format(model_name, model_format)
+ if inferred_model_format not in INPUT_MODELS.module_dict.keys():
+ supported_keys = list(INPUT_MODELS.module_dict.keys())
+ print(f'with model name {model_name} and model format {model_format}, '
+ f'the inferred model format is {inferred_model_format}, '
+ f'which is not in the supported list {supported_keys}')
+ exit(-1)
+
+ # get tokenizer path
+ tokenizer_path = get_tokenizer_path(model_path, tokenizer_path)
+
+ # create workspace
+ create_workspace(dst_path)
+
+ triton_models_path = copy_triton_model_templates(dst_path)
+
+ copy_tokenizer(model_path, tokenizer_path, triton_models_path)
+
+ # turbomind config
+ cfg = TurbomindModelConfig.from_dict({}, allow_none=True)
+ cfg.model_name = model_name
+ cfg.tensor_para_size = tp
+ cfg.rotary_embedding = cfg.size_per_head
+ cfg.group_size = group_size
+ if inferred_model_format.find('awq') != -1:
+ cfg.weight_type = 'int4'
+ output_format = 'w4'
+ assert group_size > 0, 'group_size should be > 0'
+
+ # convert
+ print('model_name ', model_name)
+ print('model_format ', model_format)
+ print('inferred_model_format ', inferred_model_format)
+ print('model_path ', model_path)
+ print('tokenizer_path ', tokenizer_path)
+ print('output_format ', output_format)
+ weight_path = osp.join(triton_models_path, 'weights')
+ input_model = INPUT_MODELS.get(inferred_model_format)(
+ model_path=model_path,
+ tokenizer_path=tokenizer_path,
+ ckpt_path=quant_path)
+ output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model,
+ cfg=cfg,
+ to_file=True,
+ out_dir=weight_path)
+ output_model.export()
+
+ # update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
+ with open(osp.join(triton_models_path, 'interactive', 'config.pbtxt'),
+ 'a') as f:
+ param = \
+ 'parameters {\n key: "tensor_para_size"\n value: {\n ' \
+ 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \
+ 'parameters {\n key: "model_name"\n value: {\n ' \
+ 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n'
+ f.write(param)
+
+ # pack model repository for triton inference server
+ pack_model_repository(dst_path)
+
+ # update the value of $TP in `service_docker_up.sh`
+ file_path = osp.join(dst_path, 'service_docker_up.sh')
+ with open(file_path, 'r') as f:
+ content = f.read()
+ content = re.sub('TP=1', f'TP={tp}', content)
+ with open(file_path, 'w') as f:
+ f.write(content)
+
+
+if __name__ == '__main__':
+ fire.Fire(main)
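
The converter resolves the input-model class from the model-name prefix and the requested format; `get_model_format` reduces to a lookup in `special_input_model_map` plus an optional `-awq` suffix. A few example resolutions, assuming the function imports cleanly from an installed checkout and behaves exactly as written above:

```python
from lmdeploy.turbomind.deploy.converter import get_model_format

assert get_model_format('internlm-chat-7b', None) == 'hf'
assert get_model_format('qwen-7b', None) == 'qwen'
assert get_model_format('llama2-7b', 'awq') == 'hf-awq'
assert get_model_format('baichuan2-7b', 'awq') == 'baichuan2-awq'
```
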
diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py
new file mode 100644
index 0000000000..7c6627c770
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
+from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
+from .llama import LlamaModel # noqa: F401
+from .llama_awq import LlamaAwqModel # noqa: F401
+from .meta_llama import MetaLlamaModel # noqa: F401
+from .qwen import QwenModel # noqa: F401
+from .qwen_awq import QwenAwqModel # noqa: F401
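
Each source-model class registers itself under a format name, so the converter can look the class up through the `INPUT_MODELS` registry instead of an if/elif chain. A sketch of the lookup, mirroring how `converter.py` uses it; the paths are placeholders:

```python
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS

# 'baichuan2' is one of the names registered above; the paths are placeholders.
input_model = INPUT_MODELS.get('baichuan2')(
    model_path='/path/to/Baichuan2-7B-Chat',
    tokenizer_path='/path/to/Baichuan2-7B-Chat/tokenizer.model',
    ckpt_path=None)
print(input_model.model_info())
```
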
diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py
new file mode 100644
index 0000000000..46ccb6309d
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/baichuan.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+class BaichuanReader(LlamaReader):
+ """BaichuanReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
+ """Get q, k, v, o kind for layer i."""
+ result = []
+ pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}'
+ qkv = self.params[pack_key]
+ result.extend(torch.split(qkv, qkv.shape[size_dim] // 3, dim=dim))
+ o = self.params[f'model.layers.{i}.self_attn.o_proj.{kind}']
+ result.append(o)
+ return (*result, )
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ return self._attn(i, 'weight', 0, 0)
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return (None, ) * 4
+
+
+class Baichuan2Reader(BaichuanReader):
+ """Baichuan2Reader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def output_weight(self):
+ """Get output."""
+ # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
+ tensor = self.params.get('lm_head.weight', None)
+ if tensor is not None:
+ tensor = tensor.cuda()
+ tensor = torch.nn.functional.normalize(tensor)
+ return tensor
+
+
+@INPUT_MODELS.register_module(name='baichuan')
+class BaichuanModel(LlamaModel):
+ """Llama model in baichuan format."""
+
+ Reader = BaichuanReader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
+ super().__init__(model_path, tokenizer_path, **kwargs)
+
+
+@INPUT_MODELS.register_module(name='baichuan2')
+class Baichuan2Model(LlamaModel):
+ """Llama model in baichuan format."""
+
+ Reader = Baichuan2Reader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
+ super().__init__(model_path, tokenizer_path, **kwargs)
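
`BaichuanReader` recovers separate q, k and v projections from the fused `W_pack` tensor by splitting it into three equal chunks along the packed dimension. A toy-shaped sketch of that split; the sizes are made up for illustration:

```python
import torch

hidden = 8                                # toy hidden size
w_pack = torch.randn(3 * hidden, hidden)  # fused qkv as stored by Baichuan
q, k, v = torch.split(w_pack, w_pack.shape[0] // 3, dim=0)
assert q.shape == k.shape == v.shape == (hidden, hidden)
```
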
diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py
new file mode 100644
index 0000000000..d5d60286a8
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .baichuan import Baichuan2Model, BaichuanModel, BaichuanReader
+from .base import INPUT_MODELS
+from .llama_awq import ensure_fp16orint32
+
+
+class BaichuanAwqReader(BaichuanReader):
+ """BaichuanAwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def attn(self, i: int):
+ """Get q, k, v, o qweight for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o qzeros for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scales for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
+
+ def ffn(self, i: int):
+ """Get ffn qweight for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qweight'))
+
+ def ffn_zero(self, i: int):
+ """Get ffn qzeros for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qzeros'))
+
+ def ffn_scale(self, i: int):
+ """Get ffn scales for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'scales'))
+
+
+class Baichuan2AwqReader(BaichuanAwqReader):
+ """Baichuan2AwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def output_weight(self):
+ """Get output."""
+ # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
+ tensor = self.params.get('lm_head.weight', None)
+ if tensor is not None:
+ tensor = tensor.cuda()
+ tensor = torch.nn.functional.normalize(tensor)
+ return tensor
+
+
+@INPUT_MODELS.register_module(name='baichuan-awq')
+class BaichuanAwqModel(BaichuanModel):
+ """Baichuan awq model in hf format."""
+
+ Reader = BaichuanAwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
+
+
+@INPUT_MODELS.register_module(name='baichuan2-awq')
+class Baichuan2AwqModel(Baichuan2Model):
+ """Baichuan2 awq model in hf format."""
+
+ Reader = Baichuan2AwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py
new file mode 100644
index 0000000000..89f18033e9
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/base.py
@@ -0,0 +1,174 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import re
+from abc import ABC, abstractmethod
+from typing import Dict, Iterator, Tuple, Union
+
+import torch
+from mmengine import Registry
+
+INPUT_MODELS = Registry(
+ 'source model', locations=['lmdeploy.turbomind.deploy.source_model.base'])
+
+
+class BaseReader(ABC):
+ """Base checkpoint manager."""
+
+ def __init__(self):
+ pass
+
+ @property
+ @abstractmethod
+ def start_layer_id(self) -> int:
+ """Get the start transformer layer number."""
+ pass
+
+ @property
+ @abstractmethod
+ def end_layer_id(self) -> int:
+ """Get the end transformer layer number."""
+ pass
+
+ @abstractmethod
+ def init_layer_id(self) -> None:
+ """Get start and end transformer layer number."""
+ self._start_layer_id = -1
+ self._end_layer_id = -1
+ layer_count = {}
+ for key in self.params:
+ layer_id = re.findall(self.attn_layer_patten, key)
+ if len(layer_id) == 0:
+ continue
+ layer_id = int(layer_id[0])
+ if layer_id not in layer_count:
+ layer_count[layer_id] = 0
+ layer_count[layer_id] += 1
+ if len(layer_count) == 0:
+ return
+ if not (len(layer_count) > 1 or self.last_bin):
+ return
+ max_count = max([layer_count[layer_id] for layer_id in layer_count])
+ valid_layer_id = [
+ layer_id for layer_id in layer_count
+ if layer_count[layer_id] == max_count
+ ]
+ self._start_layer_id = min(valid_layer_id)
+ self._end_layer_id = max(valid_layer_id) + 1
+
+ @abstractmethod
+ def clean_up(self, last: bool) -> None:
+ """Clean up unused params."""
+ if last:
+ self.params.clear()
+ else:
+ to_remove = []
+ for key in self.params:
+ layer_id = re.findall(self.attn_layer_patten, key)
+ if len(layer_id) == 0:
+ to_remove.append(key)
+ else:
+ layer_id = int(layer_id[0])
+ if layer_id < self.end_layer_id:
+ to_remove.append(key)
+ for key in to_remove:
+ self.params.pop(key, None)
+ torch.cuda.empty_cache()
+
+ @abstractmethod
+ def tok_embeddings(self) -> Union[torch.Tensor, None]:
+ """Get embeddings."""
+ pass
+
+ @abstractmethod
+ def norm_weight(self) -> Union[torch.Tensor, None]:
+ """Get norm."""
+ pass
+
+ @abstractmethod
+ def output_weight(self) -> Union[torch.Tensor, None]:
+ """Get output."""
+ pass
+
+ @abstractmethod
+ def attn(self, i: int) -> Tuple[torch.Tensor]:
+ """Get q, k, v, o weight for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_bias(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get q, k, v, o bias for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get q, k, v, o zero point for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get q, k, v, o scale for layer i."""
+ pass
+
+ @abstractmethod
+ def attn_norm(self, i: int) -> torch.Tensor:
+ """Get attn norm for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn(self, i: int) -> Tuple[torch.Tensor]:
+ """Get ffn weight for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get ffn zero point for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
+ """Get ffn scale for layer i."""
+ pass
+
+ @abstractmethod
+ def ffn_norm(self, i: int) -> torch.Tensor:
+ """Get ffn norm for layer i."""
+ pass
+
+
+class BaseInputModel(ABC):
+ """Base class for input model."""
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+ """Constructor for BaseInputModel.
+
+ Args:
+ model_path (str): the path of the model.
+ tokenizer_path (str): the path of the tokenizer model.
+ """
+ self.model_path = model_path
+ self.tokenizer_path = tokenizer_path
+
+ @property
+ @abstractmethod
+ def nmgrs(self) -> int:
+ """Get number of checkpoint."""
+ pass
+
+ @abstractmethod
+ def get_mgrs(self) -> Iterator[BaseReader]:
+ """Conctruct all BaseReader."""
+ pass
+
+ @abstractmethod
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ pass
+
+ @abstractmethod
+ def model_info(self) -> Dict:
+ """Read model info."""
+ pass
+
+ def bins(self) -> Iterator[BaseReader]:
+ """Get Reader."""
+ for mgr in self.get_mgrs():
+ yield mgr
diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py
new file mode 100644
index 0000000000..f800260467
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/llama.py
@@ -0,0 +1,198 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+import os.path as osp
+
+import torch
+from safetensors.torch import load_file
+
+from lmdeploy.tokenizer import Tokenizer
+
+from .base import INPUT_MODELS, BaseInputModel, BaseReader
+
+
+class LlamaReader(BaseReader):
+ """LlamaReader."""
+
+ attn_layer_patten = r'model.layers.([0-9]+).'
+ tok_embeddings_key = 'model.embed_tokens.weight'
+ norm_weight_key = 'model.norm.weight'
+ output_weight_key = 'lm_head.weight'
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__()
+ self.params = unused_params
+ self.params.update(new_params)
+ self.last_bin = last_bin
+ self.init_layer_id()
+
+ def init_layer_id(self):
+ """Get start/end transformer layer id."""
+ super().init_layer_id()
+
+ def clean_up(self, last: bool) -> None:
+ """Clean up unused params."""
+ super().clean_up(last)
+
+ @property
+ def start_layer_id(self):
+ """Get start transformer layer id."""
+ return self._start_layer_id
+
+ @property
+ def end_layer_id(self):
+ """Get end transformer layer id."""
+ return self._end_layer_id
+
+ def tok_embeddings(self):
+ """Get embeddings."""
+ return self.params.get(self.tok_embeddings_key, None)
+
+ def norm_weight(self):
+ """Get norm."""
+ return self.params.get(self.norm_weight_key, None)
+
+ def output_weight(self):
+ """Get output."""
+ return self.params.get(self.output_weight_key, None)
+
+ def _attn(self, i: int, kind: str, allow_none=False):
+ """Get q, k, v, o kind for layer i."""
+ result = []
+ for key in ['q', 'k', 'v', 'o']:
+ tensor = self.params.get(
+ f'model.layers.{i}.self_attn.{key}_proj.{kind}')
+ if not allow_none:
+ assert tensor is not None
+ result.append(tensor)
+ return (*result, )
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ return self._attn(i, 'weight')
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return self._attn(i, 'bias', allow_none=True)
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o zero point for layer i."""
+ return (None, ) * 4
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scale for layer i."""
+ return (None, ) * 4
+
+ def attn_norm(self, i: int):
+ """Get attn norm for layer i."""
+ return self.params[f'model.layers.{i}.input_layernorm.weight']
+
+ def _ffn(self, i: int, kind: str):
+ """Get ffn kind for layer i."""
+ result = []
+ for key in ['gate', 'down', 'up']:
+ tensor = self.params[f'model.layers.{i}.mlp.{key}_proj.{kind}']
+ result.append(tensor)
+ return (*result, )
+
+ def ffn(self, i: int):
+ """Get ffn weight for layer i."""
+ return self._ffn(i, 'weight')
+
+ def ffn_zero(self, i: int):
+ """Get ffn zero point for layer i."""
+ return (None, ) * 3
+
+ def ffn_scale(self, i: int):
+ """Get ffn scale for layer i."""
+ return (None, ) * 3
+
+ def ffn_norm(self, i: int):
+ """Get ffn norm for layer i."""
+ return self.params[f'model.layers.{i}.post_attention_layernorm.weight']
+
+
+@INPUT_MODELS.register_module(name='hf')
+class LlamaModel(BaseInputModel):
+ """Llama model in hf format."""
+
+ Reader = LlamaReader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
+ super().__init__(model_path, tokenizer_path)
+ ckpt_path = kwargs.get('ckpt_path')
+ if ckpt_path is None:
+ ckpt_path = model_path
+ self.ckpt_path = ckpt_path
+ self.ckpt_files = self.get_ckpt()
+
+ def get_ckpt(self):
+ """Get weight files."""
+ suffixes = ['.safetensors', '.bin']
+ files = []
+ for suffix in suffixes:
+ files = [
+ file for file in os.listdir(self.ckpt_path)
+ if file.endswith(suffix)
+ ]
+ if len(files) > 0:
+ break
+ files = sorted(files)
+ return files
+
+ @property
+ def nmgrs(self):
+ """Get number of checkpoint."""
+ return len(self.ckpt_files)
+
+ def get_mgrs(self):
+ """Conctruct all Reader."""
+ assert self.nmgrs > 0, \
+ f'could not find checkpoints in {self.ckpt_path}'
+ unused_params = {}
+ try:
+ for i, ckpt in enumerate(self.ckpt_files):
+ is_last_bin = i == len(self.ckpt_files) - 1
+ if ckpt.endswith('.bin'):
+ new_params = torch.load(osp.join(self.ckpt_path, ckpt),
+ map_location='cpu')
+ else:
+ new_params = load_file(osp.join(self.ckpt_path, ckpt))
+ ret = self.Reader(new_params, unused_params,
+ i == self.nmgrs - 1)
+ yield ret
+ ret.clean_up(is_last_bin)
+ except GeneratorExit:
+ ret.clean_up(True)
+
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ assert osp.isdir(self.model_path), self.model_path
+ tk_model = Tokenizer(self.model_path)
+ n_words = tk_model.vocab_size
+ bos_id = tk_model.bos_token_id
+ eos_id = tk_model.eos_token_id
+ return n_words, bos_id, eos_id
+
+ def model_info(self):
+ """Read model info."""
+ params_path = osp.join(self.model_path, 'config.json')
+ with open(params_path) as f:
+ model_arg = json.load(f)
+ num_layer = model_arg['num_hidden_layers']
+ norm_eps = model_arg['rms_norm_eps']
+ if 'num_key_value_heads' in model_arg:
+ kv_head_num = model_arg['num_key_value_heads']
+ else:
+ kv_head_num = model_arg['num_attention_heads']
+ rope_theta = float(model_arg.get('rope_theta', 10000.0))
+ max_position_embeddings = int(
+ model_arg.get('max_position_embeddings', 0))
+ rope_scaling = bool(model_arg.get('rope_scaling', False))
+
+ return dict(num_layer=num_layer,
+ norm_eps=norm_eps,
+ kv_head_num=kv_head_num,
+ rope_theta=rope_theta,
+ max_position_embeddings=max_position_embeddings,
+ use_dynamic_ntk=int(rope_scaling))
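
A usage sketch for the reader API above (illustrative only: the checkpoint path is a placeholder, lmdeploy and its dependencies must be importable, and `'hf'` is the name registered by the decorator on `LlamaModel`):

```python
from lmdeploy.turbomind.deploy.source_model import llama  # noqa: F401, registers 'hf'
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS

# Placeholder path: point this at a local HF-format checkpoint directory.
input_model = INPUT_MODELS.get('hf')(model_path='/path/to/internlm-chat-7b',
                                     tokenizer_path='/path/to/internlm-chat-7b')
print(input_model.model_info())        # num_layer, norm_eps, kv_head_num, ...
for reader in input_model.bins():      # one LlamaReader per weight shard
    for i in range(reader.start_layer_id, reader.end_layer_id):
        qw, kw, vw, ow = reader.attn(i)  # q/k/v/o projection weights of layer i
```
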
diff --git a/lmdeploy/turbomind/deploy/source_model/llama_awq.py b/lmdeploy/turbomind/deploy/source_model/llama_awq.py
new file mode 100644
index 0000000000..9d2ae8ac50
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/llama_awq.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+def ensure_fp16orint32(tensors: torch.Tensor):
+ """Ensure tensors in fp16/int32 format."""
+ result = []
+ for tensor in tensors:
+ if tensor is not None:
+ if tensor.dtype in [torch.float16, torch.float32, torch.bfloat16]:
+ result.append(tensor.half())
+ else:
+ assert tensor.dtype == torch.int32
+ result.append(tensor)
+ else:
+ result.append(None)
+ return (*result, )
+
+
+class LlamaAwqReader(LlamaReader):
+ """LlamaAwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def attn(self, i: int):
+ """Get q, k, v, o qweight for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qweight'))
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o qzeros for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qzeros'))
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scales for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'scales'))
+
+ def ffn(self, i: int):
+ """Get ffn qweight for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qweight'))
+
+ def ffn_zero(self, i: int):
+ """Get ffn qzeros for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qzeros'))
+
+ def ffn_scale(self, i: int):
+ """Get ffn scales for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'scales'))
+
+
+@INPUT_MODELS.register_module(name='hf-awq')
+class LlamaAwqModel(LlamaModel):
+ """Llama Awq model in hf format."""
+
+ Reader = LlamaAwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
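
A small illustration of `ensure_fp16orint32` (a sketch; it assumes lmdeploy is importable): floating-point tensors are cast to fp16, int32 tensors such as AWQ-packed `qweight`/`qzeros` pass through unchanged, and `None` entries stay `None`.

```python
import torch

from lmdeploy.turbomind.deploy.source_model.llama_awq import ensure_fp16orint32

out = ensure_fp16orint32((torch.ones(2, 2, dtype=torch.float32),
                          torch.zeros(2, 2, dtype=torch.int32),
                          None))
print([None if t is None else t.dtype for t in out])
# [torch.float16, torch.int32, None]
```
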
diff --git a/lmdeploy/turbomind/deploy/source_model/meta_llama.py b/lmdeploy/turbomind/deploy/source_model/meta_llama.py
new file mode 100644
index 0000000000..bc26361c73
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/meta_llama.py
@@ -0,0 +1,224 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+from pathlib import Path
+
+import torch
+from sentencepiece import SentencePieceProcessor
+
+from .base import INPUT_MODELS, BaseInputModel, BaseReader
+
+
+def reverse_permute(x: torch.Tensor, size_per_head: int = 128):
+ """reverse permute to hf format."""
+ if x.shape[-1] > 1:
+ dim = x.shape[-1]
+ n_heads = dim // size_per_head
+ return x.view(-1, n_heads, dim // n_heads // 2,
+ 2).transpose(2, 3).reshape(-1, dim)
+ else: # scales, zeros
+ dim = x.shape[0]
+ n_heads = dim // size_per_head
+ return x.view(n_heads, dim // n_heads // 2, 2,
+ 1).transpose(1, 2).reshape(dim, 1)
+
+
+class MetaLlamaReader(BaseReader):
+ """MetaLlamaReader."""
+
+ def __init__(self, model_path: str, start_layer_id: int,
+ end_layer_id: int):
+ super().__init__()
+ self._start_layer_id = start_layer_id
+ self._end_layer_id = end_layer_id
+ self.params = self.load_model(model_path)
+
+ def init_layer_id(self):
+ """Empty."""
+ pass
+
+ def load_model(self, model_path):
+ """Load all parameters."""
+ checkpoints = []
+ for pattern in ['*.pth', '*.pt']:
+ checkpoints += sorted(Path(model_path).glob(pattern))
+ n_ckpt = len(checkpoints)
+ model_params = {}
+
+ def get_param(_name, _size):
+ if _name not in model_params:
+ model_params[_name] = torch.zeros(_size,
+ dtype=torch.float16,
+ device='cpu')
+ return model_params[_name]
+
+ from tqdm import tqdm
+ pbar = tqdm(total=n_ckpt, desc='load meta ckpt', leave=False)
+ for i, ckpt_path in enumerate(checkpoints):
+ ckpt = torch.load(ckpt_path, map_location='cpu')
+ for param_name, param_data in ckpt.items():
+ key, ext = param_name.split('.')[-2:]
+ # column-parallel
+ if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
+ size = param_data.size(0)
+ if ext == 'weight':
+ param = get_param(
+ param_name,
+ [size * n_ckpt, param_data.size(1)])
+ param.data[size * i:size * (i + 1), :] = param_data
+ else: # bias
+ param = get_param(param_name, [size * n_ckpt])
+ param.data[size * i:size * (i + 1)] = param_data
+ # row-parallel
+ elif key in ['w2', 'wo', 'tok_embeddings']:
+ size = param_data.size(-1)
+ if ext == 'weight':
+ param = get_param(param_name,
+ [param_data.size(0), size * n_ckpt])
+ param.data[:, size * i:size * (i + 1)] = param_data
+ else: # bias
+ param = get_param(param_name, [size])
+ param.data = param_data
+ elif i == 0:
+ param = get_param(param_name, param_data.size())
+ param.data = param_data
+ del ckpt
+ pbar.update(1)
+ pbar.close()
+
+ for name, param in model_params.items():
+ # transpose all weights as TurboMind is expecting column-major
+ # (output_dims, input_dims) -> (input_dims, output_dims)
+ key = name.split('.')[-2]
+ if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
+ param.data = param.data.t()
+ if key in ['wq', 'wk']:
+ param.data = reverse_permute(param.data)
+ return model_params
+
+ def clean_up(self, last: bool) -> None:
+ """Clean up unused params."""
+ self.params.clear()
+
+ @property
+ def start_layer_id(self):
+ """Get start transformer layer id."""
+ return self._start_layer_id
+
+ @property
+ def end_layer_id(self):
+ """Get end transformer layer id."""
+ return self._end_layer_id
+
+ def tok_embeddings(self):
+ """Get embeddings."""
+ return self.params.get('tok_embeddings.weight')
+
+ def norm_weight(self):
+ """Get norm."""
+ return self.params.get('norm.weight')
+
+ def output_weight(self):
+ """Get output."""
+ return self.params.get('output.weight')
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ result = []
+ for key in ['wq', 'wk', 'wv', 'wo']:
+ tensor = self.params[f'layers.{i}.attention.{key}.weight']
+ tensor = tensor.t() if tensor is not None else None
+ result.append(tensor)
+ return (*result, )
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ result = []
+ for key in ['wq', 'wk', 'wv', 'wo']:
+ tensor = self.params.get(f'layers.{i}.attention.{key}.bias')
+ tensor = tensor.t() if tensor is not None else None
+ result.append(tensor)
+ return (*result, )
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o zero point for layer i."""
+ return (None, ) * 4
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scale for layer i."""
+ return (None, ) * 4
+
+ def attn_norm(self, i: int):
+ """Get attn norm for layer i."""
+ return self.params[f'layers.{i}.attention_norm.weight']
+
+ def ffn(self, i: int):
+ """Get ffn weight for layer i."""
+ result = []
+ for key in ['w1', 'w2', 'w3']:
+ tensor = self.params[f'layers.{i}.feed_forward.{key}.weight']
+ result.append(tensor.t())
+ return (*result, )
+
+ def ffn_zero(self, i: int):
+ """Get ffn zero point for layer i."""
+ return (None, ) * 3
+
+ def ffn_scale(self, i: int):
+ """Get ffn scale for layer i."""
+ return (None, ) * 3
+
+ def ffn_norm(self, i: int):
+ """Get ffn norm for layer i."""
+ return self.params[f'layers.{i}.ffn_norm.weight']
+
+
+@INPUT_MODELS.register_module(name='llama')
+class MetaLlamaModel(BaseInputModel):
+ """Llama model in fb format."""
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+ super().__init__(model_path, tokenizer_path, **kwargs)
+
+ @property
+ def nmgrs(self):
+ """Get number of checkpoint."""
+ return 1
+
+ def get_mgrs(self):
+ """Conctruct all BaseReader."""
+ end_layer_id = self.model_info()['num_layer']
+ try:
+ if hasattr(self, 'meta_reader'):
+ yield self.meta_reader
+ else:
+ self.meta_reader = MetaLlamaReader(self.model_path, 0,
+ end_layer_id)
+ yield self.meta_reader
+ except GeneratorExit:
+ pass
+
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ assert osp.isfile(self.tokenizer_path), self.tokenizer_path
+ sp_model = SentencePieceProcessor(model_file=self.tokenizer_path)
+ # BOS / EOS token IDs
+ n_words = sp_model.vocab_size()
+ bos_id = sp_model.bos_id()
+ eos_id = sp_model.eos_id()
+ return n_words, bos_id, eos_id
+
+ def model_info(self):
+ """Read model info."""
+ params_path = osp.join(self.model_path, 'params.json')
+ with open(params_path) as f:
+ model_arg = json.load(f)
+ num_layer = model_arg['n_layers']
+ norm_eps = model_arg['norm_eps']
+ head_num = model_arg.get('n_heads', 32)
+ kv_head_num = model_arg.get('n_kv_heads', head_num)
+
+ return dict(num_layer=num_layer,
+ norm_eps=norm_eps,
+ head_num=head_num,
+ kv_head_num=kv_head_num)
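
`reverse_permute` is, mathematically, the inverse of the `permute` helper added in `target_model/base.py` below; a quick self-check (illustrative; importing `target_model` assumes the compiled turbomind extension is available, since the package `__init__` pulls it in):

```python
import torch

from lmdeploy.turbomind.deploy.source_model.meta_llama import reverse_permute
from lmdeploy.turbomind.deploy.target_model.base import permute

w = torch.randn(4096, 4096)  # any 2-D tensor whose last dim is a multiple of 128
assert torch.equal(reverse_permute(permute(w)), w)
```
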
diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py
new file mode 100644
index 0000000000..09ff93afc5
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+
+import torch
+
+from .base import INPUT_MODELS
+from .llama import LlamaModel, LlamaReader
+
+
+class QwenReader(LlamaReader):
+ """QwenReader."""
+
+ attn_layer_patten = r'transformer.h.([0-9]+).'
+ tok_embeddings_key = 'transformer.wte.weight'
+ norm_weight_key = 'transformer.ln_f.weight'
+ output_weight_key = 'lm_head.weight'
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
+ """Get q, k, v, o kind for layer i."""
+ qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}']
+ q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim)
+ o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None)
+ if o is None:
+ o = torch.zeros_like(q)
+ return q, k, v, o
+
+ def attn(self, i: int):
+ """Get q, k, v, o weight for layer i."""
+ return self._attn(i, 'weight', 0, 0)
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return self._attn(i, 'bias', -1, 0)
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o zero point for layer i."""
+ return (None, ) * 4
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scale for layer i."""
+ return (None, ) * 4
+
+ def attn_norm(self, i: int):
+ """Get attn norm for layer i."""
+ return self.params[f'transformer.h.{i}.ln_1.weight']
+
+ def _ffn(self, i: int, kind: str):
+ """Get ffn kind for layer i."""
+ result = []
+ for key in ['w2', 'c_proj', 'w1']:
+ tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}']
+ result.append(tensor)
+ return (*result, )
+
+ def ffn(self, i: int):
+ """Get ffn weight for layer i."""
+ return self._ffn(i, 'weight')
+
+ def ffn_zero(self, i: int):
+ """Get ffn zero point for layer i."""
+ return (None, ) * 3
+
+ def ffn_scale(self, i: int):
+ """Get ffn scale for layer i."""
+ return (None, ) * 3
+
+ def ffn_norm(self, i: int):
+ """Get ffn norm for layer i."""
+ return self.params[f'transformer.h.{i}.ln_2.weight']
+
+
+@INPUT_MODELS.register_module(name='qwen')
+class QwenModel(LlamaModel):
+ """Qwen model in hf format."""
+
+ Reader = QwenReader
+
+ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
+ super().__init__(model_path, tokenizer_path, **kwargs)
+
+ def tokenizer_info(self):
+ """Read tokenizer info."""
+ n_words = 151851
+ bos_id = 0
+ eos_id = 151643
+ return n_words, bos_id, eos_id
+
+ def model_info(self):
+ """Read model info."""
+ params_path = osp.join(self.model_path, 'config.json')
+ with open(params_path) as f:
+ config = json.load(f)
+ num_layer = config['num_hidden_layers']
+ norm_eps = config['layer_norm_epsilon']
+ rope_theta = float(config.get('rotary_emb_base', 10000.0))
+ if 'num_key_value_heads' in config:
+ kv_head_num = config['num_key_value_heads']
+ else:
+ kv_head_num = config['num_attention_heads']
+ seq_length = config['seq_length']
+ use_dynamic_ntk = int(config['use_dynamic_ntk'])
+ use_logn_attn = int(config['use_logn_attn'])
+ return dict(num_layer=num_layer,
+ norm_eps=norm_eps,
+ kv_head_num=kv_head_num,
+ rope_theta=rope_theta,
+ max_position_embeddings=seq_length,
+ use_dynamic_ntk=int(use_dynamic_ntk),
+ use_logn_attn=use_logn_attn)
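
Qwen keeps q, k and v fused in a single `c_attn` projection, which `_attn` above splits into three equal chunks along the output dimension. A minimal shape sketch (the 4096 hidden size is an assumption):

```python
import torch

hidden = 4096                                    # assumed hidden size
c_attn_weight = torch.randn(3 * hidden, hidden)  # transformer.h.<i>.attn.c_attn.weight
q, k, v = torch.split(c_attn_weight, c_attn_weight.size(0) // 3, dim=0)
print(q.shape, k.shape, v.shape)                 # three (4096, 4096) chunks
```
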
diff --git a/lmdeploy/turbomind/deploy/source_model/qwen_awq.py b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py
new file mode 100644
index 0000000000..04df2ac729
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import INPUT_MODELS
+from .llama_awq import ensure_fp16orint32
+from .qwen import QwenModel, QwenReader
+
+
+class QwenAwqReader(QwenReader):
+ """QwenAwqReader."""
+
+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
+ super().__init__(new_params, unused_params, last_bin)
+
+ def attn(self, i: int):
+ """Get q, k, v, o qweight for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
+
+ def attn_bias(self, i: int):
+ """Get q, k, v, o bias for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'bias', -1, 0))
+
+ def attn_zero(self, i: int):
+ """Get q, k, v, o qzeros for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
+
+ def attn_scale(self, i: int):
+ """Get q, k, v, o scales for layer i."""
+ return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
+
+ def ffn(self, i: int):
+ """Get ffn qweight for layer i."""
+ # ours: w2(silu(w1(x)) * w3(x))
+ # qwen: c_proj(w1(x) * silu(w2(x)))
+ return ensure_fp16orint32(self._ffn(i, 'qweight'))
+
+ def ffn_zero(self, i: int):
+ """Get ffn qzeros for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'qzeros'))
+
+ def ffn_scale(self, i: int):
+ """Get ffn scales for layer i."""
+ return ensure_fp16orint32(self._ffn(i, 'scales'))
+
+
+@INPUT_MODELS.register_module(name='qwen-awq')
+class QwenAwqModel(QwenModel):
+ """Qwen awq model in hf format."""
+
+ Reader = QwenAwqReader
+
+ def __init__(self,
+ model_path: str,
+ tokenizer_path: str,
+ ckpt_path: str = None,
+ **kwargs):
+ super().__init__(model_path,
+ tokenizer_path,
+ ckpt_path=ckpt_path,
+ **kwargs)
diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py
new file mode 100644
index 0000000000..fe03500e45
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .fp import TurbomindModel # noqa: F401
+from .w4 import TurbomindW4Model # noqa: F401
diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py
new file mode 100644
index 0000000000..5e9b5341f7
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/base.py
@@ -0,0 +1,236 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import configparser
+import inspect
+import os.path as osp
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import torch
+import tqdm
+from mmengine import Registry
+
+from lmdeploy.model import MODELS
+
+from ..source_model.base import BaseInputModel, BaseReader
+
+OUTPUT_MODELS = Registry(
+ 'target model', locations=['lmdeploy.turbomind.deploy.target_model.base'])
+
+
+def tprint(*args, **kwargs):
+ from io import StringIO
+ s = StringIO()
+ print(*args, **kwargs, file=s, end='')
+ tqdm.tqdm.write(s.getvalue())
+
+
+@dataclass
+class TurbomindModelConfig:
+ """Config for turbomind model."""
+ model_name: str
+ tensor_para_size: int
+ head_num: int
+ kv_head_num: int
+ vocab_size: int
+ num_layer: int
+ inter_size: int
+ norm_eps: float
+ attn_bias: int
+ start_id: int
+ end_id: int
+ session_len: int
+ weight_type: str = 'fp16'
+ rotary_embedding: int = 128
+ rope_theta: float = 10000.0
+ size_per_head: int = 128
+ group_size: int = 0
+ max_batch_size: int = 32
+ max_context_token_num: int = 4
+ step_length: int = 1
+ cache_max_entry_count: int = 48
+ cache_chunk_size: int = 1
+ use_context_fmha: int = 1
+ quant_policy: int = 0
+ max_position_embeddings: int = 0
+ use_dynamic_ntk: int = 0
+ use_logn_attn: int = 0
+
+ @classmethod
+ def from_dict(cls, env, allow_none=False):
+ """Construct from dict."""
+ params = inspect.signature(cls).parameters
+ used = {k: v for k, v in env.items() if k in params and v is not None}
+ if not allow_none:
+ return cls(**used)
+ else:
+ default = {
+ k: None
+ for k in params.keys() if params[k].default is inspect._empty
+ }
+ default.update(used)
+ return cls(**default)
+
+ @property
+ def valid(self):
+ """Check if cfg is valid."""
+ for _, v in self.__dict__.items():
+ if v is None:
+ return False
+ return True
+
+
+class BaseOutputModel(ABC):
+ """Base output model."""
+
+ def __init__(self,
+ input_model: BaseInputModel,
+ cfg: TurbomindModelConfig,
+ to_file: bool = True,
+ out_dir: str = ''):
+ super().__init__()
+ self.input_model = input_model
+ self.cfg = self.get_config(cfg)
+ assert self.cfg.valid
+ self.to_file = to_file
+ self.out_dir = out_dir
+
+ @abstractmethod
+ def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig:
+ """Generate turbomind model config (config.ini)."""
+ _, bos_id, eos_id = self.input_model.tokenizer_info()
+ model = MODELS.get(cfg.model_name)()
+ final_cfg = cfg.__dict__
+ final_cfg.update(
+ dict(start_id=bos_id,
+ end_id=eos_id,
+ session_len=model.session_len + 8))
+ final_cfg.update(self.input_model.model_info())
+
+ # head_num, vocab_size
+ for bin in self.input_model.bins():
+ emb = bin.tok_embeddings()
+ if emb is not None:
+ _vocab_size, dim = emb.shape
+ head_num = dim // cfg.size_per_head
+ break
+ final_cfg.update(dict(head_num=head_num, vocab_size=_vocab_size))
+ return TurbomindModelConfig.from_dict(final_cfg, allow_none=True)
+
+ def export_config(self) -> None:
+ """export turbomind config."""
+ if self.to_file:
+ config = configparser.ConfigParser()
+ cfg = dict(llama=self.cfg.__dict__)
+ for section, key_values in cfg.items():
+ config[section] = key_values
+ config_path = osp.join(self.out_dir, 'config.ini')
+ with open(config_path, 'w') as f:
+ config.write(f)
+
+ def export_weight(self, param: torch.Tensor, name: str) -> None:
+ """export turbomind weight."""
+ if self.to_file:
+ if param.dtype in [torch.float, torch.bfloat16]:
+ param = param.half()
+ tprint(name, param.shape)
+ param.contiguous().cpu().numpy().tofile(
+ osp.join(self.out_dir, name))
+
+ def save_split(self,
+ tensor: torch.Tensor,
+ name: str,
+ split_dim=None,
+ copy=False) -> None:
+ """save split."""
+ tp = self.cfg.tensor_para_size
+ if split_dim is not None:
+ tprint(f'*** splitting {name}, shape={tensor.shape}, '
+ f'split_dim={split_dim}, tp={tp}')
+ assert tensor.shape[split_dim] % tp == 0
+ split_size = tensor.shape[split_dim] // tp
+ splits = torch.split(tensor, split_size, dim=split_dim)
+ for i, split in enumerate(splits):
+ prefix, ext = osp.splitext(name)
+ self.export_weight(split, f'{prefix}.{i}{ext}')
+ elif copy:
+ tprint(f'### copying {name}, shape={tensor.shape}')
+ copies = [tensor] * tp
+ for i, copy in enumerate(copies):
+ prefix, ext = osp.splitext(name)
+ self.export_weight(copy, f'{prefix}.{i}{ext}')
+ else:
+ self.export_weight(tensor, name)
+
+ def export(self) -> None:
+ """Export to turbomind model format."""
+ num_layer = self.cfg.num_layer
+ from tqdm import tqdm
+ pbar = tqdm(total=num_layer, desc='Convert to turbomind format')
+ self.export_config()
+ for bin in self.input_model.bins():
+ self.export_misc(bin)
+ for i in range(bin.start_layer_id, bin.end_layer_id):
+ self.export_transformer_block(bin, i)
+ pbar.update(1)
+ pbar.close()
+ # manually clean up meta reader
+ if hasattr(self.input_model, 'meta_reader'):
+ self.input_model.meta_reader.clean_up(True)
+ del self.input_model.meta_reader
+ torch.cuda.empty_cache()
+
+ def export_misc(self, bin: BaseReader) -> None:
+ """Export embedding, norm, output weight."""
+ emb = bin.tok_embeddings()
+ norm_weight = bin.norm_weight()
+ output_weight = bin.output_weight()
+
+ def pad_weight(tensor):
+ pad_size = None
+ vocab_size = self.cfg.vocab_size
+ tp = self.cfg.tensor_para_size
+ if vocab_size % tp != 0:
+ pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size
+
+ if pad_size is None:
+ return tensor
+ return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size),
+ 'constant', 0)
+
+ if emb is not None:
+ emb = pad_weight(emb)
+ self.export_weight(emb, 'tok_embeddings.weight')
+ if norm_weight is not None:
+ self.export_weight(norm_weight, 'norm.weight')
+ if output_weight is not None:
+ output_weight = pad_weight(output_weight)
+ self.export_weight(output_weight, 'output.weight')
+
+ @abstractmethod
+ def export_transformer_block(self, bin: BaseReader, i: int) -> None:
+ """Export transformer block."""
+ pass
+
+
+def permute(x: torch.Tensor, size_per_head: int = 128):
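+ """Rearrange the rotary-embedding halves of q/k tensors (weights, biases or per-head scales/zeros)."""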
+ if x.shape[-1] > 1:
+ dim = x.shape[-1]
+ n_heads = dim // size_per_head
+ return x.view(-1, n_heads, 2,
+ dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
+ else: # scales, zeros
+ dim = x.shape[0]
+ n_heads = dim // size_per_head
+ return x.view(n_heads, 2, dim // n_heads // 2,
+ 1).transpose(1, 2).reshape(dim, 1)
+
+
+def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
+ dim: int):
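+ """Fuse q, k, v into one tensor whose output dim is grouped by tensor-parallel rank."""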
+
+ def reshape(x):
+ return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)
+
+ qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
+ # (input_dim, head_num + 2 * kv_head_num)
+ return qkv.view(q.size(0), -1)
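
A shape sketch for the helpers above (illustrative; the head and tp sizes are assumptions, and importing the package assumes a full lmdeploy installation): `permute` reorders rotary halves without changing a tensor's shape, `merge_qkv` packs q/k/v into one tensor grouped by tensor-parallel rank, and `TurbomindModelConfig.from_dict(..., allow_none=True)` builds a partially-filled config whose `valid` flag stays `False` until every field is set.

```python
import torch

from lmdeploy.turbomind.deploy.target_model.base import (TurbomindModelConfig,
                                                          merge_qkv, permute)

q = torch.randn(4096, 4096)   # (input_dim, head_num * size_per_head)
k = torch.randn(4096, 1024)   # (input_dim, kv_head_num * size_per_head)
v = torch.randn(4096, 1024)
qkv = merge_qkv(permute(q), permute(k), v, tp=2, dim=2)
print(qkv.shape)              # torch.Size([4096, 6144])

cfg = TurbomindModelConfig.from_dict(
    dict(model_name='llama2', tensor_para_size=2), allow_none=True)
print(cfg.valid)              # False: head_num, vocab_size, ... are still None
```
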
diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py
new file mode 100644
index 0000000000..d9a7783436
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/fp.py
@@ -0,0 +1,80 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+
+from ..source_model.base import BaseInputModel, BaseReader
+from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
+ merge_qkv, permute)
+
+
+def transpose_tensor(input: List[torch.Tensor]):
+ """Transpose tensor."""
+ output = [x.cuda().t() for x in input]
+ return output
+
+
+@OUTPUT_MODELS.register_module(name='fp16')
+class TurbomindModel(BaseOutputModel):
+ """Export to turbomind fp16 format."""
+
+ def __init__(self,
+ input_model: BaseInputModel,
+ cfg: TurbomindModelConfig,
+ to_file: bool = True,
+ out_dir: str = ''):
+ super().__init__(input_model, cfg, to_file, out_dir)
+
+ def get_config(self, cfg: TurbomindModelConfig):
+ """Get turbomind config."""
+ final_cfg = super().get_config(cfg).__dict__
+
+ # attn_bias, inter_size
+ visit = False
+ attn_bias = 0
+ for bin in self.input_model.bins():
+ for i in range(bin.start_layer_id, bin.end_layer_id):
+ visit = True
+ w1, _, _ = bin.ffn(i)
+ inter_size = w1.t().shape[-1]
+ qb, _, _, _ = bin.attn_bias(i)
+ if qb is not None:
+ attn_bias = 1
+ break
+ if visit:
+ break
+ final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
+ return TurbomindModelConfig.from_dict(final_cfg)
+
+ def export_transformer_block(self, bin: BaseReader, i: int):
+ """Export transformer layer i."""
+ assert bin.start_layer_id <= i < bin.end_layer_id
+ tp = self.cfg.tensor_para_size
+ size_per_head = self.cfg.size_per_head
+ # attn
+ qw, kw, vw, ow = bin.attn(i)
+ qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow])
+ qw = permute(qw, size_per_head)
+ kw = permute(kw, size_per_head)
+ qkv_w = merge_qkv(qw, kw, vw, tp, dim=2)
+ self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1)
+ self.save_split(ow, f'layers.{i}.attention.wo.weight', 0)
+ qb, kb, vb, ob = bin.attn_bias(i)
+ if qb is not None:
+ qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob])
+ qb = permute(qb, size_per_head)
+ kb = permute(kb, size_per_head)
+ qkv_b = merge_qkv(qb, kb, vb, tp, dim=1)
+ self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
+ self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True)
+ # ffn
+ w1, w2, w3 = bin.ffn(i)
+ w1, w2, w3 = transpose_tensor([w1, w2, w3])
+ self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1)
+ self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1)
+ self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0)
+ # norm
+ attn_norm = bin.attn_norm(i)
+ ffn_norm = bin.ffn_norm(i)
+ self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
+ self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
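
Wiring the two registries together gives the fp16 conversion path; roughly what the new `lmdeploy convert` flow is expected to do (a sketch, not the actual CLI code: the paths, output directory layout and model name below are placeholders/assumptions):

```python
from lmdeploy.turbomind.deploy.source_model import llama  # noqa: F401, registers 'hf'
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
from lmdeploy.turbomind.deploy.target_model.base import (OUTPUT_MODELS,
                                                         TurbomindModelConfig)

input_model = INPUT_MODELS.get('hf')(model_path='/path/to/internlm-chat-7b',
                                     tokenizer_path='/path/to/internlm-chat-7b')
cfg = TurbomindModelConfig.from_dict(
    dict(model_name='internlm-chat-7b', tensor_para_size=1), allow_none=True)
output_model = OUTPUT_MODELS.get('fp16')(
    input_model=input_model, cfg=cfg,
    out_dir='./workspace/triton_models/weights')  # directory must already exist
output_model.export()  # writes config.ini plus per-layer weight files
```
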
diff --git a/lmdeploy/turbomind/deploy/target_model/w4.py b/lmdeploy/turbomind/deploy/target_model/w4.py
new file mode 100644
index 0000000000..282c7df607
--- /dev/null
+++ b/lmdeploy/turbomind/deploy/target_model/w4.py
@@ -0,0 +1,162 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import sys
+
+import torch
+
+import lmdeploy
+
+from ..source_model.base import BaseInputModel, BaseReader
+from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
+ merge_qkv, permute)
+
+# import _turbomind as _tm
+# TODO: find another way to import _turbomind
+lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
+sys.path.append(osp.join(lmdeploy_dir, 'lib'))
+import _turbomind as _tm # noqa: E402
+
+
+def transpose_qk_s4(src: torch.Tensor, group_size):
+ assert src.is_contiguous()
+ dst = torch.zeros_like(src)
+ _tm.transpose_qk_s4_k_m8(src, dst,
+ src.size(-1) * 8, src.size(0), group_size)
+ return dst
+
+
+def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, w1_s: torch.Tensor,
+ w3_qw: torch.Tensor, w3_qz: torch.Tensor,
+ w3_s: torch.Tensor):
+
+ def fuse(a: torch.Tensor, b: torch.Tensor):
+ ab = torch.cat((a, b)).contiguous()
+ _ab = torch.zeros_like(ab)
+ _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
+ return _ab.view(a.size(0), -1)
+
+ w13_qw = fuse(w1_qw, w3_qw)
+ w13_qz = fuse(w1_qz, w3_qz)
+
+ w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
+ w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)
+
+ return w13_qw, w13_qz, w13_s
+
+
+def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
+ group_size: int):
+ assert qw.is_contiguous()
+ assert qz.is_contiguous()
+ assert s.is_contiguous()
+ _qw = torch.zeros_like(qw)
+ _sz = torch.zeros_like(s, dtype=torch.int32) # half2
+ _ws = torch.zeros_like(s)
+ _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
+ qw.size(-1) * 8, qw.size(0), group_size)
+ return _qw, _sz
+
+
+def tp_m_s4(x: torch.Tensor, tp: int):
+ return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
+ 1).contiguous()
+
+
+def get_cuda_tensor(tensors):
+ """Get cuda tensor."""
+ result = map(lambda x: x.cuda() if x is not None else x, tensors)
+ return (*result, )
+
+
+@OUTPUT_MODELS.register_module(name='w4')
+class TurbomindW4Model(BaseOutputModel):
+ """Export to turbomind w4a16 format."""
+
+ def __init__(self,
+ input_model: BaseInputModel,
+ cfg: TurbomindModelConfig,
+ to_file: bool = True,
+ out_dir: str = ''):
+ super().__init__(input_model, cfg, to_file, out_dir)
+
+ def get_config(self, cfg: TurbomindModelConfig):
+ """Get turbomind config."""
+ final_cfg = super().get_config(cfg).__dict__
+
+ # attn_bias, inter_size
+ visit = False
+ attn_bias = 0
+ for bin in self.input_model.bins():
+ for i in range(bin.start_layer_id, bin.end_layer_id):
+ visit = True
+ w1s, _, _ = bin.ffn_scale(i)
+ inter_size = w1s.shape[-1]
+ qb, _, _, _ = bin.attn_bias(i)
+ if qb is not None:
+ attn_bias = 1
+ break
+ if visit:
+ break
+ final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
+ return TurbomindModelConfig.from_dict(final_cfg)
+
+ def export_transformer_block(self, bin: BaseReader, i: int):
+ """Export transformer layer i."""
+ group_size = self.cfg.group_size
+ tp = self.cfg.tensor_para_size
+ size_per_head = self.cfg.size_per_head
+ # attn
+ q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
+ q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i))
+ q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i))
+
+ q_qw = transpose_qk_s4(q_qw, group_size)
+ k_qw = transpose_qk_s4(k_qw, group_size)
+ q_qz = transpose_qk_s4(q_qz, group_size)
+ k_qz = transpose_qk_s4(k_qz, group_size)
+ q_s = permute(q_s, size_per_head)
+ k_s = permute(k_s, size_per_head)
+
+ qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
+ qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
+ qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
+
+ qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
+ qkv_qw = tp_m_s4(qkv_qw, tp)
+ self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
+ self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
+
+ o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
+ self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
+ self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)
+
+ q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
+ if q_b is not None:
+ q_b = permute(q_b, size_per_head)
+ k_b = permute(k_b, size_per_head)
+ qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
+ self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
+ self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)
+
+ # ffn weights
+ w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
+ w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
+ w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i))
+
+ w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
+ w3_s)
+ w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
+ w13_qw = tp_m_s4(w13_qw, tp)
+ self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
+ self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
+ -1)
+
+ w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
+ self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
+ self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
+
+ # norm
+ attn_norm = bin.attn_norm(i)
+ ffn_norm = bin.ffn_norm(i)
+ self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
+ self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py
index 328f182158..9a4f0e8c4d 100644
--- a/lmdeploy/turbomind/generate_gemm_config.py
+++ b/lmdeploy/turbomind/generate_gemm_config.py
@@ -2,8 +2,6 @@
import subprocess
-import fire
-
def get_llama_gemm():
import os.path as osp
@@ -30,4 +28,6 @@ def main(head_num: int = 32,
if __name__ == '__main__':
+ import fire
+
fire.Fire(main)
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index dcfc499e89..9d2186fea9 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -13,7 +13,7 @@
from torch.nn.utils.rnn import pad_sequence
import lmdeploy
-from lmdeploy.model import MODELS
+from lmdeploy.model import MODELS, BaseModel
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.utils import get_logger
@@ -78,7 +78,11 @@ class TurboMind:
tp (int): tensor parallel
"""
- def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
+ def __init__(self,
+ model_path: str,
+ eos_id: int = 2,
+ tp: int = 1,
+ **kwargs):
self.eos_id = eos_id
# TODO: support mpi
@@ -88,7 +92,6 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
# read meta from model path
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
self.gpu_count = tp
- self.session_len = 2048
data_type = 'fp16'
ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
with open(ini_path, 'r') as f:
@@ -102,18 +105,18 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
if len(section_name) > 0:
tp_cfg = parser.getint(section_name, 'tensor_para_size')
- self.session_len = parser.getint(section_name, 'session_len')
if tp_cfg != 1 and tp_cfg != tp:
get_logger('turbomind').info(
f'found tp={tp_cfg} in config.ini.')
self.gpu_count = tp_cfg
self.model_name = parser.get(section_name, 'model_name')
data_type = parser.get(section_name, 'weight_type')
- model = MODELS.get(self.model_name)()
+ self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+ self.session_len = self.model.session_len
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
tokenizer = Tokenizer(tokenizer_model_path)
- self.stop_words = _stop_words(model.stop_words, tokenizer)
+ self.stop_words = _stop_words(self.model.stop_words, tokenizer)
# params
self.node_id = node_id
@@ -122,17 +125,17 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
# create model
weight_dir = osp.join(model_path, 'triton_models', 'weights')
- model = _tm.AbstractTransformerModel.create_llama_model(
+ model_comm = _tm.AbstractTransformerModel.create_llama_model(
weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
- self.model = model
- self.nccl_params = model.create_nccl_params(self.node_id)
+ self.model_comm = model_comm
+ self.nccl_params = model_comm.create_nccl_params(self.node_id)
torch.cuda.synchronize()
# create weight
def _create_weight(device_id):
with cuda_ctx(device_id):
rank = self.node_id * self.gpu_count + device_id
- model.create_shared_weights(device_id, rank)
+ model_comm.create_shared_weights(device_id, rank)
threads = []
for device_id in range(self.gpu_count):
@@ -161,7 +164,7 @@ class TurboMindInstance:
cuda_stream_id(int): identity of a cuda stream
"""
- def __init__(self, tm_model, cuda_stream_id=0):
+ def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0):
self.tm_model = tm_model
self.cuda_stream_id = cuda_stream_id
@@ -175,7 +178,7 @@ def __init__(self, tm_model, cuda_stream_id=0):
self.session_len = tm_model.session_len
self.nccl_params = tm_model.nccl_params
- self.instance_comm = tm_model.model.create_instance_comm(
+ self.instance_comm = tm_model.model_comm.create_instance_comm(
self.gpu_count)
# create model instances
@@ -196,7 +199,7 @@ def __init__(self, tm_model, cuda_stream_id=0):
def _create_model_instance(self, device_id, model_insts):
with cuda_ctx(device_id):
rank = self.node_id * self.gpu_count + device_id
- model_inst = self.tm_model.model.create_model_instance(
+ model_inst = self.tm_model.model_comm.create_model_instance(
device_id, rank, self.cuda_stream_id, self.nccl_params)
model_insts[device_id] = model_inst
@@ -266,7 +269,7 @@ def stream_infer(self,
self.model_insts[0].register_callback(self._forward_callback)
if len(input_ids) == 0:
- input_ids = []
+ input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
@@ -381,7 +384,7 @@ def decode(self, input_ids):
"""
if len(input_ids) == 0:
- input_ids = []
+ input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 417dc76768..0bd4914cc4 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
-__version__ = '0.0.11'
+__version__ = '0.0.14'
short_version = __version__
diff --git a/requirements.txt b/requirements.txt
index 9eacb498fb..27049672c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ accelerate
datasets
fastapi
fire
-gradio
+gradio<4.0.0
mmengine
numpy
pybind11
diff --git a/setup.py b/setup.py
index 09ae1e31c2..df36118c23 100644
--- a/setup.py
+++ b/setup.py
@@ -121,26 +121,29 @@ def gen_packages_items():
if __name__ == '__main__':
lmdeploy_package_data = ['lmdeploy/bin/llama_gemm']
- setup(name='lmdeploy',
- version=get_version(),
- description='A toolset for compressing, deploying and serving LLM',
- long_description=readme(),
- long_description_content_type='text/markdown',
- author='OpenMMLab',
- author_email='openmmlab@gmail.com',
- packages=find_packages(exclude=()),
- package_data={
- 'lmdeploy': lmdeploy_package_data,
- },
- include_package_data=True,
- install_requires=parse_requirements('requirements.txt'),
- has_ext_modules=check_ext_modules,
- classifiers=[
- 'Programming Language :: Python :: 3.8',
- 'Programming Language :: Python :: 3.9',
- 'Programming Language :: Python :: 3.10',
- 'Programming Language :: Python :: 3.11',
- 'Intended Audience :: Developers',
- 'Intended Audience :: Education',
- 'Intended Audience :: Science/Research',
- ])
+ setup(
+ name='lmdeploy',
+ version=get_version(),
+ description='A toolset for compressing, deploying and serving LLM',
+ long_description=readme(),
+ long_description_content_type='text/markdown',
+ author='OpenMMLab',
+ author_email='openmmlab@gmail.com',
+ packages=find_packages(exclude=()),
+ package_data={
+ 'lmdeploy': lmdeploy_package_data,
+ },
+ include_package_data=True,
+ install_requires=parse_requirements('requirements.txt'),
+ has_ext_modules=check_ext_modules,
+ classifiers=[
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: Python :: 3.10',
+ 'Programming Language :: Python :: 3.11',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Education',
+ 'Intended Audience :: Science/Research',
+ ],
+ entry_points={'console_scripts': ['lmdeploy = lmdeploy.cli:run']},
+ )
diff --git a/tests/test_lmdeploy/test_cli.py b/tests/test_lmdeploy/test_cli.py
new file mode 100644
index 0000000000..a41eab442e
--- /dev/null
+++ b/tests/test_lmdeploy/test_cli.py
@@ -0,0 +1,51 @@
+import inspect
+
+
+def compare_func(class_method, function):
+ """Compare if a class method has same arguments as a function."""
+
+ argspec_cls = inspect.getfullargspec(class_method)
+ argspec_func = inspect.getfullargspec(function)
+ assert argspec_cls.args[1:] == argspec_func.args
+ assert argspec_cls.defaults == argspec_func.defaults
+ assert argspec_cls.annotations == argspec_func.annotations
+
+
+def test_cli():
+
+ from lmdeploy.cli.cli import CLI
+ from lmdeploy.serve.turbomind.deploy import main as convert
+ compare_func(CLI.convert, convert)
+
+
+def test_subcli_chat():
+ from lmdeploy.cli.chat import SubCliChat
+ from lmdeploy.pytorch.chat import main as run_torch_model
+ from lmdeploy.turbomind.chat import main as run_turbomind_model
+
+ compare_func(SubCliChat.torch, run_torch_model)
+ compare_func(SubCliChat.turbomind, run_turbomind_model)
+
+
+def test_subcli_lite():
+ from lmdeploy.cli.lite import SubCliLite
+ from lmdeploy.lite.apis.auto_awq import auto_awq
+ from lmdeploy.lite.apis.calibrate import calibrate
+ from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
+
+ compare_func(SubCliLite.auto_awq, auto_awq)
+ compare_func(SubCliLite.calibrate, calibrate)
+ compare_func(SubCliLite.kv_qparams, run_kv_qparams)
+
+
+def test_subcli_serve():
+ from lmdeploy.cli.serve import SubCliServe
+ from lmdeploy.serve.client import main as run_triton_client
+ from lmdeploy.serve.gradio.app import run as run_gradio
+ from lmdeploy.serve.openai.api_client import main as run_api_client
+ from lmdeploy.serve.openai.api_server import main as run_api_server
+
+ compare_func(SubCliServe.gradio, run_gradio)
+ compare_func(SubCliServe.api_server, run_api_server)
+ compare_func(SubCliServe.api_client, run_api_client)
+ compare_func(SubCliServe.triton_client, run_triton_client)
diff --git a/tests/test_lmdeploy/test_tokenizer.py b/tests/test_lmdeploy/test_tokenizer.py
new file mode 100644
index 0000000000..ff7d8047b2
--- /dev/null
+++ b/tests/test_lmdeploy/test_tokenizer.py
@@ -0,0 +1,24 @@
+import pytest
+
+from lmdeploy.tokenizer import HuggingFaceTokenizer
+
+
+@pytest.mark.parametrize('model_path', [
+ 'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
+ 'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf',
+ 'upstage/SOLAR-0-70b-16bit'
+])
+@pytest.mark.parametrize(
+ 'input', ['hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5])
+def test_tokenizer(model_path, input):
+ tokenizer = HuggingFaceTokenizer(model_path)
+ encoded = tokenizer.encode(input)
+ output = ''
+ offset = 0
+ for i in range(1, len(encoded) + 1):
+ decoded = tokenizer.decode(encoded[:i], offset)
+ if decoded.endswith('�'):
+ continue
+ output += decoded
+ offset = i
+ assert input == output, 'input string should equal the output after encode-decode'