diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml index 86838836de..d9e6956735 100644 --- a/.github/ISSUE_TEMPLATE/1-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -25,6 +25,18 @@ body: A placeholder for the command. validations: required: true +- type: textarea + attributes: + label: Environment + description: | + 1. Please run `lmdeploy check_env` to collect necessary environment information and paste it here. + 2. You may add addition that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + placeholder: Environment here. + render: Shell + validations: + required: true - type: textarea attributes: label: Error traceback diff --git a/README.md b/README.md index a2de4d6ac0..7639675aba 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by ## Supported Models -`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. +`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names. ### TurboMind @@ -63,6 +63,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by | :----------: | :-------------: | :--: | :-----: | :---: | :--: | | Llama | Yes | Yes | Yes | Yes | No | | Llama2 | Yes | Yes | Yes | Yes | No | +| SOLAR | Yes | Yes | Yes | Yes | No | | InternLM-7B | Yes | Yes | Yes | Yes | No | | InternLM-20B | Yes | Yes | Yes | Yes | No | | QWen-7B | Yes | Yes | Yes | No | No | @@ -118,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl GIT_LFS_SKIP_SMUDGE=1 # 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default -python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b +lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b ``` #### Inference by TurboMind ```shell -python -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` > **Note**
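The README hunks above swap the old `python -m lmdeploy.*` module invocations for the new `lmdeploy` console command that this patch adds under `lmdeploy/cli/`. Those commands are dispatched through `fire.Fire` over nested objects (see `lmdeploy/cli/cli.py` later in the diff); the stand-alone sketch below only illustrates that dispatch pattern with hypothetical names and is not code from this patch.

```python
# Minimal illustration of the fire-based dispatch the new `lmdeploy` CLI relies on.
# `python demo_cli.py chat turbomind ./workspace --tp 2` resolves to
# DemoCLI().chat.turbomind('./workspace', tp=2). All names here are made up.
import fire


class DemoChat:
    """Nested object -> `demo_cli chat ...` subcommand group."""

    def turbomind(self, model_path: str, tp: int = 1):
        print(f'would chat with the turbomind model at {model_path}, tp={tp}')


class DemoCLI:
    """Top-level object -> `demo_cli <command> ...`."""

    def list(self):
        print('supported model names: ...')


def run():
    cli = DemoCLI()
    cli.chat = DemoChat()  # attribute becomes a subcommand group
    fire.Fire(cli, name='demo_cli')


if __name__ == '__main__':
    run()
```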
@@ -139,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace #### Serving with gradio ```shell -python3 -m lmdeploy.serve.gradio.app ./workspace +lmdeploy serve gradio ./workspace ``` ![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) @@ -149,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace Launch inference server by: ```shell -python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --instance_num 32 --tp 1 ``` Then, you can communicate with it by command line, ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client api_server_url ``` or webui, ```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +# api_server_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui -# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 +lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port} ``` Refer to [restful_api.md](docs/en/restful_api.md) for more details. @@ -181,13 +182,13 @@ bash workspace/service_docker_up.sh Then, you can communicate with the inference server by command line, ```shell -python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 +lmdeploy serve triton_client {server_ip_addresss}:33337 ``` or webui, ```shell -python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337 +lmdeploy serve gradio {server_ip_addresss}:33337 ``` For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md) @@ -199,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc #### Single GPU ```shell -python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \ +lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \ --max_new_tokens 64 \ --temperture 0.8 \ --top_p 0.95 \ diff --git a/README_zh-CN.md b/README_zh-CN.md index 09c66c2826..38faad0583 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -53,7 +53,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht ## 支持的模型 -`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端 +`LMDeploy` 支持 `TurboMind` 和 `Pytorch` 两种推理后端。运行`lmdeploy list`可查看支持模型列表 ### TurboMind @@ -64,6 +64,7 @@ LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](ht | :----------: | :------: | :--: | :-----: | :---: | :--: | | Llama | Yes | Yes | Yes | Yes | No | | Llama2 | Yes | Yes | Yes | Yes | No | +| SOLAR | Yes | Yes | Yes | Yes | No | | InternLM-7B | Yes | Yes | Yes | Yes | No | | InternLM-20B | Yes | Yes | Yes | Yes | No | | QWen-7B | Yes | Yes | Yes | No | No | @@ -119,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl GIT_LFS_SKIP_SMUDGE=1 # 2. 转换为 trubomind 要求的格式。默认存放路径为 ./workspace -python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b +lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b ``` #### 使用 turbomind 推理 ```shell -python3 -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` > **Note**
@@ -139,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace #### 启动 gradio server ```shell -python3 -m lmdeploy.serve.gradio.app ./workspace +lmdeploy serve gradio ./workspace ``` ![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) @@ -149,23 +150,23 @@ python3 -m lmdeploy.serve.gradio.app ./workspace 使用下面的命令启动推理服务: ```shell -python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1 ``` 你可以通过命令行方式与推理服务进行对话: ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client api_server_url ``` 也可以通过 WebUI 方式来对话: ```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +# api_server_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui -# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 +lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port} ``` 更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)。 @@ -181,13 +182,13 @@ bash workspace/service_docker_up.sh 你可以通过命令行方式与推理服务进行对话: ```shell -python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 +lmdeploy serve triton_client {server_ip_addresss}:33337 ``` 也可以通过 WebUI 方式来对话: ```shell -python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337 +lmdeploy serve gradio {server_ip_addresss}:33337 ``` 其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md) @@ -203,7 +204,7 @@ pip install deepspeed #### 单个 GPU ```shell -python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL\ +lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL\ --max_new_tokens 64 \ --temperture 0.8 \ --top_p 0.95 \ diff --git a/benchmark/README.md b/benchmark/README.md index b5573ae2b8..3fa117210e 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -30,7 +30,7 @@ pip install nvidia-ml-py ```bash python profile_generation.py \ --model-path /path/to/your/model \ - --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512 + --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512 ``` ## profile serving diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py index e64a6708cd..325877f4e3 100644 --- a/benchmark/profile_generation.py +++ b/benchmark/profile_generation.py @@ -106,7 +106,7 @@ def _infer(model, session_id): def profile_throughput(model_path: str, concurrency: int = 1, - input_seqlen: int = 0, + input_seqlen: int = 1, output_seqlen: int = 512, test_round: int = 10, tp: int = 1, @@ -133,8 +133,10 @@ def profile_throughput(model_path: str, ) # make up a prompt that can be tokenized into {input_seqlen} tokens - prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1) + assert input_seqlen > 0, 'input_seqlen should > 0' + prompt = 'hi' input_ids = tokenizer.encode(prompt) + input_ids = input_ids * input_seqlen warmup(tm_model, concurrency, diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index d1f6ebf80e..394c7ec1b9 100644 --- a/benchmark/profile_restful_api.py +++ 
b/benchmark/profile_restful_api.py @@ -1,98 +1,73 @@ import json -import multiprocessing as mp import random import time -from typing import Iterable, List +from queue import Queue +from threading import Thread import fire import numpy as np -import requests +from lmdeploy.serve.openai.api_client import get_streaming_response from lmdeploy.tokenizer import Tokenizer -from lmdeploy.utils import get_logger - - -def get_streaming_response(prompt: str, - api_url: str, - session_id: int, - request_output_len: int, - stream: bool = True, - sequence_start: bool = True, - sequence_end: bool = False, - ignore_eos: bool = False) -> Iterable[List[str]]: - headers = {'User-Agent': 'Test Client'} - pload = { - 'prompt': prompt, - 'stream': stream, - 'session_id': session_id, - 'request_output_len': request_output_len, - 'sequence_start': sequence_start, - 'sequence_end': sequence_end, - 'ignore_eos': ignore_eos - } - response = requests.post(api_url, - headers=headers, - json=pload, - stream=stream) - for chunk in response.iter_lines(chunk_size=8192, - decode_unicode=False, - delimiter=b'\n'): - if chunk: - data = json.loads(chunk.decode('utf-8')) - output = data['text'] - tokens = data['tokens'] - yield output, tokens - - -def infer(server_addr: str, session_id: int, req_queue: mp.Queue, - res_que: mp.Queue): + + +def infer(server_addr: str, session_id: int, req_queue: Queue, res_que: Queue, + stream_output: bool): stats = [] - while not req_queue.empty(): - prompt, input_seqlen, output_seqlen = req_queue.get() - get_logger('profile_restful_api').info( - f'request info: session {session_id}, ' - f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}') + for prompt, input_seqlen, output_seqlen in iter(req_queue.get, + [None, None, None]): + if prompt is None: + break timestamps = [] tokens = [] - start = time.perf_counter() - for res, token in get_streaming_response( + timestamps.append(time.perf_counter()) + for res, token, status in get_streaming_response( prompt, server_addr, session_id, request_output_len=output_seqlen, - sequence_start=True, - sequence_end=True): + interactive_mode=False, + ignore_eos=True, + stream=stream_output): timestamps.append(time.perf_counter()) tokens.append(token) - first_token_latency = timestamps[1] - start - token_latency = timestamps[-1] - timestamps[0] - token = tokens[-1] - tokens[0] - stats.append([first_token_latency, token, token_latency]) + first_token_latency = np.round(timestamps[1] - timestamps[0], 3) + token_latency = np.round(timestamps[-1] - timestamps[0], 3) + completion_tokens = tokens[-1] + total_tokens = tokens[-1] + input_seqlen + stats.append([ + first_token_latency, completion_tokens, output_seqlen, + total_tokens, token_latency + ]) + print(f'session {session_id}: ' + f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, ' + f'completion_tokens {completion_tokens}') res_que.put((session_id, stats)) def warmup(server_addr: str, concurrency: int, output_seqlen: int, - warmup_round: int = 1): + warmup_round: int = 1, + stream_output: bool = False): print('start to warmup ...') def _infer(server_addr, session_id): for _ in range(warmup_round): - for _, _ in get_streaming_response( - '', - server_addr, - session_id, - request_output_len=output_seqlen, - sequence_start=True, - sequence_end=True): + for _ in get_streaming_response('', + server_addr, + session_id, + request_output_len=output_seqlen, + interactive_mode=False, + stream=stream_output, + ignore_eos=True): continue _start = time.perf_counter() procs = [] for i in 
range(concurrency): - proc = mp.Process(target=_infer, args=(server_addr, i + 1)) + proc = Thread(target=_infer, args=(server_addr, i + 1)) procs.append(proc) proc.start() for proc in procs: @@ -115,6 +90,7 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int, print(f'elapsed time for read data: ' f'{round(time.perf_counter() - start, 2)} s') + print('start tokenization. This takes a while, please wait...') start = time.perf_counter() tokenizer = Tokenizer(tokenizer_path) prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts] @@ -136,9 +112,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int, if samples > 0: filtered_dataset = random.sample(filtered_dataset, samples) - que = mp.Queue() + que = Queue() for data in filtered_dataset: que.put(data) + que.put((None, None, None)) print(f'elapsed time for filtering: ' f'{round(time.perf_counter() - start, 2)} s') return que, len(filtered_dataset) @@ -149,17 +126,20 @@ def main(server_addr: str, dataset_path: str, concurrency: int = 1, session_len: int = 2048, - samples: int = 1000): - api_url = server_addr + '/generate' - warmup(api_url, concurrency, session_len - 1) + samples: int = 1000, + stream_output: bool = False): + api_url = server_addr + '/v1/chat/interactive' + warmup(api_url, concurrency, session_len - 1, 4, stream_output) req_queue, n_req = read_dataset(tokenizer_path, dataset_path, samples, session_len) - res_que = mp.Queue() + for i in range(concurrency): + req_queue.put([None, None, None]) + res_que = Queue() procs = [] _start = time.perf_counter() for i in range(concurrency): - proc = mp.Process(target=infer, - args=(api_url, i + 1, req_queue, res_que)) + proc = Thread(target=infer, + args=(api_url, i + 1, req_queue, res_que, stream_output)) procs.append(proc) proc.start() for proc in procs: @@ -174,22 +154,40 @@ def main(server_addr: str, f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n') stats.append(np.array(_stats)) - stats = np.concatenate(stats).reshape(-1, 3) + stats = np.concatenate(stats).reshape(-1, 5) first_token_latency_min = np.min(stats[:, 0], axis=0) first_token_latency_max = np.max(stats[:, 0], axis=0) first_token_latency_ave = np.mean(stats[:, 0], axis=0) - token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time - req_throughput = n_req / elapsed_time + completion_tokens = np.sum(stats[:, 1], axis=0) + request_output_tokens = np.sum(stats[:, 2], axis=0) + total_tokens = np.sum(stats[:, 3], axis=0) + prompt_tokens = total_tokens - completion_tokens + completion_token_throughput = completion_tokens / elapsed_time + total_token_throughput = total_tokens / elapsed_time + rqs = n_req / elapsed_time + rqm = rqs * 60 + + if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False: + print(f'Did not generate requested number of tokens. 
' + f'Request {request_output_tokens:.0f}, ' + f'but got {completion_tokens:.0f}') print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.2f}s\n' - f'first_token latency(min, max, ave): ' - f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, ' - f'{first_token_latency_ave:.2f}s\n' - f'token throughput: {token_throughput:.2f} token/s\n' - f'req throughput: {req_throughput:.2f} req/s\n' - f'{"-" * 50}\n') + f'elapsed_time: {elapsed_time:.3f}s\n') + if stream_output: + print(f'first_token latency(min, max, ave): ' + f'{first_token_latency_min:.3f}s, ' + f'{first_token_latency_max:.3f}s, ' + f'{first_token_latency_ave:.3f}s\n') + print( + f'number of prompt tokens: {prompt_tokens:.0f}\n' + f'number of completion tokens: {completion_tokens:.0f}\n' + f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa + f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa + f'RPS (request per second): {rqs:.3f} req/s\n' + f'RPM (request per minute): {rqm:.3f} req/min\n' + f'{"-" * 50}\n') if __name__ == '__main__': diff --git a/benchmark/profile_serving.py b/benchmark/profile_serving.py index 4580757eeb..ee23452d8a 100644 --- a/benchmark/profile_serving.py +++ b/benchmark/profile_serving.py @@ -17,7 +17,7 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue): [None, None, None]): timestamps = [] tokens = [] - start = time.perf_counter() + timestamps.append(time.perf_counter()) for status, res, token in chatbot.stream_infer( session_id, prompt, @@ -26,13 +26,17 @@ def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue): sequence_end=True): timestamps.append(time.perf_counter()) tokens.append(token) - - first_token_latency = np.round(timestamps[1] - start, 3) + first_token_latency = np.round(timestamps[1] - timestamps[0], 3) token_latency = np.round(timestamps[-1] - timestamps[0], 3) - token = tokens[-1] - tokens[0] - stats.append([first_token_latency, token, token_latency]) + completion_tokens = tokens[-1] + total_tokens = tokens[-1] + input_seqlen + stats.append([ + first_token_latency, completion_tokens, output_seqlen, + total_tokens, token_latency + ]) print(f'session {session_id}: ' - f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}') + f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, ' + f'completion_tokens {completion_tokens}') res_que.put((session_id, stats)) @@ -84,6 +88,7 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int, completions = [completion for _, completion in dataset] print(f'elapsed time for read data: ' f'{round(time.perf_counter() - start, 2)} s') + print('start tokenization. 
This takes a while, please wait...') start = time.perf_counter() tokenizer = Tokenizer(tokenizer_path) @@ -124,7 +129,6 @@ def main(tritonserver_addr: str, res_que = mp.Queue() procs = [] - _start = time.perf_counter() for i in range(concurrency): chatbot = Chatbot(tritonserver_addr=tritonserver_addr, display=False, @@ -134,13 +138,15 @@ def main(tritonserver_addr: str, proc = mp.Process(target=infer, args=(chatbot, i + 1, req_que, res_que)) procs.append(proc) - proc.start() # read data and put it to queue n_req = read_dataset(tokenizer_path, dataset_path, samples, session_len, req_que) for i in range(concurrency): req_que.put([None, None, None]) + _start = time.perf_counter() + for proc in procs: + proc.start() stats = [] for i in range(concurrency): @@ -149,27 +155,42 @@ def main(tritonserver_addr: str, f'session {session_id}: processed reqs {len(_stats)}, ' f'stats: \n{_stats}\n{"-" * 50}\n') stats.append(np.array(_stats)) - _end = time.perf_counter() + elapsed_time = _end - _start - stats = np.concatenate(stats).reshape(-1, 3) + stats = np.concatenate(stats).reshape(-1, 5) first_token_latency_min = np.min(stats[:, 0], axis=0) first_token_latency_max = np.max(stats[:, 0], axis=0) first_token_latency_ave = np.mean(stats[:, 0], axis=0) - token_throughput = np.sum(stats[:, 1], axis=0) / elapsed_time - req_throughput = n_req / elapsed_time - - print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' - f'elapsed_time: {elapsed_time:.3f}s\n' - f'first_token latency(min, max, ave): ' - f'{first_token_latency_min:.3f}s, {first_token_latency_max:.3f}s, ' - f'{first_token_latency_ave:.3f}s\n' - f'token throughput: {token_throughput:.3f} token/s\n' - f'req throughput: {req_throughput:.3f} req/s\n' - f'{"-" * 50}\n') - + completion_tokens = np.sum(stats[:, 1], axis=0) + request_output_tokens = np.sum(stats[:, 2], axis=0) + total_tokens = np.sum(stats[:, 3], axis=0) + prompt_tokens = total_tokens - completion_tokens + completion_token_throughput = completion_tokens / elapsed_time + total_token_throughput = total_tokens / elapsed_time + rqs = n_req / elapsed_time + rqm = rqs * 60 + + if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False: + print(f'Did not generate requested number of tokens. 
' + f'Request {request_output_tokens:.0f}, ' + f'but got {completion_tokens:.0f}') + + print( + f'\n{"-" * 50}\nconcurrency: {concurrency}\n' + f'elapsed_time: {elapsed_time:.3f}s\n' + f'first_token latency(min, max, ave): ' + f'{first_token_latency_min:.3f}s, {first_token_latency_max:.3f}s, ' + f'{first_token_latency_ave:.3f}s\n' + f'number of prompt tokens: {prompt_tokens:.0f}\n' + f'number of completion tokens: {completion_tokens:.0f}\n' + f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa + f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa + f'RPS (request per second): {rqs:.3f} req/s\n' + f'RPM (request per minute): {rqm:.3f} req/min\n' + f'{"-" * 50}\n') for proc in procs: proc.join() diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 610fbb7657..77a0b6f242 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -8,6 +8,7 @@ from typing import List, Tuple import fire +import numpy as np from lmdeploy.tokenizer import Tokenizer @@ -80,88 +81,137 @@ def __init__(self, model_path: str, tp: int = 1): self.tm_model = tm_model self.tokenizer = tokenizer - def _inference(self, queue, session_id: int): - + def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, + stream_output: bool): model_inst = self.tm_model.create_instance() - while True: - request = queue.get() - if request is None: - # stop signal - queue.put(None) - return - else: - prompt, _, output_seqlen = request - input_ids = self.tokenizer.encode(prompt) - - for outputs in model_inst.stream_infer( - session_id, - input_ids=input_ids, - request_output_len=output_seqlen, - temperature=1.0, - top_p=1.0, - sequence_start=True, - sequence_end=True, - ignore_eos=True, - sampling_param=self.sampling_param): - if len(outputs) > 1: - res, tokens = outputs[-2:] - else: - res, tokens = outputs[0] - self.tokenizer.decode(res) - - # for pytorch engine to restart a session - if hasattr(model_inst, 'end'): - model_inst.end(session_id) - - def process_request(self, requests, concurrency: int = 1): - q = Queue() + stats = [] + timestamps = [] + tokens = [] + timestamps.append(time.perf_counter()) + for prompt, input_seqlen, output_seqlen in iter( + req_queue.get, [None, None, None]): + input_ids = self.tokenizer.encode(prompt) + offset = 0 + for outputs in model_inst.stream_infer( + session_id, + input_ids=input_ids, + request_output_len=output_seqlen, + temperature=1.0, + top_p=1.0, + sequence_start=True, + sequence_end=True, + ignore_eos=True, + stream_output=stream_output): + if len(outputs) > 1: + res, token = outputs[-2:] + else: + res, token = outputs[0] + self.tokenizer.decode(res, offset) + offset = token + timestamps.append(time.perf_counter()) + tokens.append(token) + # for pytorch engine to restart a session + if hasattr(model_inst, 'end'): + model_inst.end(session_id) + first_token_latency = np.round(timestamps[1] - timestamps[0], 3) + token_latency = np.round(timestamps[-1] - timestamps[0], 3) + completion_tokens = tokens[-1] + total_tokens = tokens[-1] + len(input_ids) + stats.append([ + first_token_latency, completion_tokens, output_seqlen, + total_tokens, token_latency + ]) + print( + f'session {session_id}: ' + f'input_seqlen {input_seqlen}, output_seqlen {output_seqlen}, ' + f'completion_tokens {completion_tokens}') + res_queue.put((session_id, stats)) + + def process_request(self, + requests, + concurrency: int = 1, + stream_output: bool = True): + res_queue = 
Queue() + req_queue = Queue() threads = [] + # feed request to q + for req in requests: + req_queue.put(req) + for i in range(concurrency): + req_queue.put([None, None, None]) + start = time.time() # start threads for i in range(concurrency): - t = Thread(target=self._inference, args=(q, i)) + t = Thread(target=self._inference, + args=(req_queue, res_queue, i, stream_output)) t.start() threads.append(t) - # feed request to q - for req in requests: - q.put(req) - - q.put(None) - # wait for finish for t in threads: t.join() - end = time.time() - - return end - start + elapsed_time = time.time() - start + + stats = [] + while not res_queue.empty(): + session_id, _stats = res_queue.get() + print(f'\n{"-" * 50}\n' + f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n') + stats.append(np.array(_stats)) + + stats = np.concatenate(stats).reshape(-1, 5) + + first_token_latency_min = np.min(stats[:, 0], axis=0) + first_token_latency_max = np.max(stats[:, 0], axis=0) + first_token_latency_ave = np.mean(stats[:, 0], axis=0) + completion_tokens = np.sum(stats[:, 1], axis=0) + request_output_tokens = np.sum(stats[:, 2], axis=0) + total_tokens = np.sum(stats[:, 3], axis=0) + prompt_tokens = total_tokens - completion_tokens + completion_token_throughput = completion_tokens / elapsed_time + total_token_throughput = total_tokens / elapsed_time + rqs = len(requests) / elapsed_time + rqm = rqs * 60 + + if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False: + print(f'Did not generate requested number of tokens. ' + f'Request {request_output_tokens:.0f}, ' + f'but got {completion_tokens:.0f}') + + print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n' + f'elapsed_time: {elapsed_time:.3f}s\n') + if stream_output: + print(f'first_token latency(min, max, ave): ' + f'{first_token_latency_min:.3f}s, ' + f'{first_token_latency_max:.3f}s, ' + f'{first_token_latency_ave:.3f}s\n') + print( + f'number of prompt tokens: {prompt_tokens:.0f}\n' + f'number of completion tokens: {completion_tokens:.0f}\n' + f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa + f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa + f'RPS (request per second): {rqs:.3f} req/s\n' + f'RPM (request per minute): {rqm:.3f} req/min\n' + f'{"-" * 50}\n') def main(dataset: str, model_path: str, concurrency: int = 1, num_prompts: int = 1000, - tp: int = 1): + tp: int = 1, + stream_output: bool = True): engine = Engine(model_path, tp=tp) tokenizer = engine.tokenizer requests = sample_requests(dataset, num_prompts, tokenizer) - elapsed_time = engine.process_request(requests, concurrency) - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_num_out_tokens = sum(output_len for _, _, output_len in requests) - print(f'Throughput requests: {len(requests) / elapsed_time:.2f} req/s') - print( - f'Throughput requests: {len(requests) * 60 / elapsed_time:.2f} req/min' - ) - print(f'Throughput tokens: {total_num_tokens / elapsed_time:.2f} tokens/s') - print('Throughput tokens(output only):' - f'{total_num_out_tokens / elapsed_time:.2f} tokens/s') + engine.process_request(requests, concurrency, stream_output) if __name__ == '__main__': diff --git a/builder/manywheel/entrypoint_build.sh b/builder/manywheel/entrypoint_build.sh index abb90562a2..8d1eb16de9 100755 --- a/builder/manywheel/entrypoint_build.sh +++ b/builder/manywheel/entrypoint_build.sh @@ -11,7 +11,7 @@ source /opt/conda/bin/activate conda activate $PYTHON_VERSION cd 
lmdeploy -mkdir build && cd build +mkdir -p build && cd build && rm -rf * bash ../generate.sh make -j$(nproc) && make install if [ $? != 0 ]; then diff --git a/docs/en/build.md b/docs/en/build.md index 7ee53ac90c..cb278073c9 100644 --- a/docs/en/build.md +++ b/docs/en/build.md @@ -1,22 +1,79 @@ -## Build from source +# Build from source -- install packages for compiling and running: +LMDeploy provides prebuilt package that can be easily installed by `pip install lmdeploy`. - ```shell - conda create -n lmdeploy python=3.10 - conda activate lmdeploy +If you have requests to build lmdeploy from source, please clone lmdeploy repository from GitHub, and follow instructions in next sections - git clone https://github.com/InternLM/lmdeploy.git - cd lmdeploy +```shell +git clone --depth=1 https://github.com/InternLM/lmdeploy +``` - pip install -r requirements.txt - conda install openmpi-mpicxx nccl rapidjson -c conda-forge - ``` +## Build in Docker (recommended) + +We highly advise using the provided docker image for lmdeploy build to circumvent complex environment setup. + +The docker image is `openmmlab/lmdeploy-builder:cuda11.8`. Make sure that docker is installed before using this image. + +In the root directory of the lmdeploy source code, please run the following command: + +```shell +cd lmdeploy # the home folder of lmdeploy source code +bash builder/manywheel/build_all_wheel.sh +``` + +All the wheel files for lmdeploy under py3.8 - py3.11 will be found in the `builder/manywheel/cuda11.8_dist` directory, such as, + +```text +builder/manywheel/cuda11.8_dist/ +├── lmdeploy-0.0.12-cp310-cp310-manylinux2014_x86_64.whl +├── lmdeploy-0.0.12-cp311-cp311-manylinux2014_x86_64.whl +├── lmdeploy-0.0.12-cp38-cp38-manylinux2014_x86_64.whl +└── lmdeploy-0.0.12-cp39-cp39-manylinux2014_x86_64.whl +``` + +If the wheel file for a specific Python version is required, such as py3.8, please execute: + +```shell +bash builder/manywheel/build_wheel.sh py38 manylinux2014_x86_64 cuda11.8 cuda11.8_dist +``` + +And the wheel file will be found in the `builder/manywheel/cuda11.8_dist` directory. + +You can use `pip install` to install the wheel file that matches the Python version on your host machine. -- build and install lmdeploy: +## Build in localhost (optional) +Firstly, please make sure gcc version is no less than 9, which can be conformed by `gcc --version`. + +Then, follow the steps below to set up the compilation environment: + +- install the dependent packages: + ```shell + pip install -r requirements.txt + apt-get install rapidjson-dev + ``` +- install [nccl](https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html), and set environment variables: + ```shell + export NCCL_ROOT_DIR=/path/to/nccl/build + export NCCL_LIBRARIES=/path/to/nccl/build/lib + ``` +- install openmpi from source: + ```shell + wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz + tar xf openmpi-4.1.5.tar.gz + cd openmpi-4.1.5 + ./configure + make -j$(nproc) && make install + ``` +- build and install lmdeploy libraries: ```shell + cd lmdeploy # the home folder of lmdeploy mkdir build && cd build sh ../generate.sh make -j$(nproc) && make install ``` +- install lmdeploy python package: + ```shell + cd .. + pip install -e . 
+ ``` diff --git a/docs/en/kv_int8.md b/docs/en/kv_int8.md index 1f5f5aa125..5dcf43ba68 100644 --- a/docs/en/kv_int8.md +++ b/docs/en/kv_int8.md @@ -18,7 +18,7 @@ dequant: f = q * scale + zp Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory. ```bash -python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b +lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b ``` If you already have a workspace directory, skip this step. @@ -29,7 +29,7 @@ Get the quantization parameters by these two steps: ```bash # get minmax -python3 -m lmdeploy.lite.apis.calibrate \ +lmdeploy lite calibrate \ --model $HF_MODEL \ --calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval --calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately @@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ --work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format # get quant parameters -python3 -m lmdeploy.lite.apis.kv_qparams \ +lmdeploy lite kv_qparams \ --work_dir $WORK_DIR \ # Directory of the last output --turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters --kv_sym False \ # Symmetric or asymmetric quantization, default is False @@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre Test the chat performance. ```bash -python3 -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` ## GPU Memory Test diff --git a/docs/en/pytorch.md b/docs/en/pytorch.md index e3662ab373..e4cd5a9cbe 100644 --- a/docs/en/pytorch.md +++ b/docs/en/pytorch.md @@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and **Example 1**: Chat with default setting ```shell -python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL +lmdeploy chat torch $PATH_TO_HF_MODEL ``` **Example 2**: Disable sampling and chat history ```shell -python -m lmdeploy.pytorch.chat \ +lmdeploy chat torch \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ --temperature 0 --max-history 0 ``` @@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \ **Example 3**: Accelerate with deepspeed inference ```shell -python -m lmdeploy.pytorch.chat \ +lmdeploy chat torch \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ --accel deepspeed ``` diff --git a/docs/en/restful_api.md b/docs/en/restful_api.md index cb70e26375..7f49edce1e 100644 --- a/docs/en/restful_api.md +++ b/docs/en/restful_api.md @@ -3,56 +3,61 @@ ### Launch Service ```shell -python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1 ``` Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage. -We provide four restful api in total. Three of them are in OpenAI format. However, we recommend users try -our own api which provides more arguments for users to modify. The performance is comparatively better. +We provide four restful api in total. Three of them are in OpenAI format. + +- /v1/chat/completions +- /v1/models +- /v1/completions + +However, we recommend users try +our own api `/v1/chat/interactive` which provides more arguments for users to modify. The performance is comparatively better. 
+ +**Note** please, if you want to launch multiple requests, you'd better set different `session_id` for both +`/v1/chat/completions` and `/v1/chat/interactive` apis. Or, we will set them random values. ### python -Here is an example for our own api `generate`. +We have integrated the client-side functionalities of these services into the `APIClient` class. Below are some examples demonstrating how to invoke the `api_server` service on the client side. + +If you want to use the `/v1/chat/completions` endpoint, you can try the following code: + +```python +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient('http://{server_ip}:{server_port}') +model_name = api_client.available_models[0] +messages = [{"role": "user", "content": "Say this is a test!"}] +for item in api_client.chat_completions_v1(model=model_name, messages=messages): + print(item) +``` + +For the `/v1/completions` endpoint. If you want to use the `/v1/completions` endpoint, you can try: + +```python +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient('http://{server_ip}:{server_port}') +model_name = api_client.available_models[0] +for item in api_client.completions_v1(model=model_name, prompt='hi'): + print(item) +``` + +Lmdeploy supports maintaining session histories on the server for `/v1/chat/interactive` api. We disable the +feature by default. + +- On interactive mode, the chat history is kept on the server. In a multiple rounds of conversation, you should set + `interactive_mode = True` and the same `session_id` (can't be -1, it's the default number) to `/v1/chat/interactive` for requests. +- On normal mode, no chat history is kept on the server. + +The interactive mode can be controlled by the `interactive_mode` boolean parameter. The following is an example of normal mode. If you want to experience the interactive mode, simply pass in `interactive_mode=True`. ```python -import json -import requests -from typing import Iterable, List - - -def get_streaming_response(prompt: str, - api_url: str, - session_id: int, - request_output_len: int, - stream: bool = True, - sequence_start: bool = True, - sequence_end: bool = True, - ignore_eos: bool = False) -> Iterable[List[str]]: - headers = {'User-Agent': 'Test Client'} - pload = { - 'prompt': prompt, - 'stream': stream, - 'session_id': session_id, - 'request_output_len': request_output_len, - 'sequence_start': sequence_start, - 'sequence_end': sequence_end, - 'ignore_eos': ignore_eos - } - response = requests.post( - api_url, headers=headers, json=pload, stream=stream) - for chunk in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b'\n'): - if chunk: - data = json.loads(chunk.decode('utf-8')) - output = data['text'] - tokens = data['tokens'] - yield output, tokens - - -for output, tokens in get_streaming_response( - "Hi, how are you?", "http://{server_ip}:{server_port}/generate", 0, - 512): - print(output, end='') +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient('http://{server_ip}:{server_port}') +for item in api_client.generate(prompt='hi'): + print(item) ``` ### Java/Golang/Rust @@ -84,16 +89,15 @@ List Models: curl http://{server_ip}:{server_port}/v1/models ``` -Generate: +Interactive Chat: ```bash -curl http://{server_ip}:{server_port}/generate \ +curl http://{server_ip}:{server_port}/v1/chat/interactive \ -H "Content-Type: application/json" \ -d '{ "prompt": "Hello! 
How are you?", "session_id": 1, - "sequence_start": true, - "sequence_end": true + "interactive_mode": true }' ``` @@ -104,19 +108,19 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "internlm-chat-7b", - "messages": [{"role": "user", "content": "Hello! Ho are you?"}] + "messages": [{"role": "user", "content": "Hello! How are you?"}] }' ``` -Embeddings: +Text Completions: -```bash -curl http://{server_ip}:{server_port}/v1/embeddings \ - -H "Content-Type: application/json" \ +```shell +curl http://{server_ip}:{server_port}/v1/completions \ + -H 'Content-Type: application/json' \ -d '{ - "model": "internlm-chat-7b", - "input": "Hello world!" - }' + "model": "llama", + "prompt": "two steps to build a house:" +}' ``` ### CLI client @@ -125,7 +129,7 @@ There is a client script for restful api server. ```shell # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +lmdeploy serve api_client api_server_url ``` ### webui @@ -133,10 +137,10 @@ python -m lmdeploy.serve.openai.api_client restful_api_url You can also test restful-api through webui. ```shell -# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +# api_server_url is what printed in api_server.py, e.g. http://localhost:23333 # server_ip and server_port here are for gradio ui -# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 +lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port} ``` ### FAQ @@ -146,10 +150,6 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True 2. When OOM appeared at the server side, please reduce the number of `instance_num` when lanching the service. -3. When the request with the same `session_id` to `generate` got a empty return value and a negative `tokens`, please consider setting `sequence_start=false` for the second question and the same for the afterwards. - -4. Requests were previously being handled sequentially rather than concurrently. To resolve this issue, - - - kindly provide unique session_id values when calling the `generate` API or else your requests may be associated with client IP addresses +3. When the request with the same `session_id` to `/v1/chat/interactive` got a empty return value and a negative `tokens`, please consider setting `interactive_mode=false` to restart the session. -5. Both `generate` api and `v1/chat/completions` upport engaging in multiple rounds of conversation, where input `prompt` or `messages` consists of either single strings or entire chat histories.These inputs are interpreted using multi-turn dialogue modes. However, ff you want to turn the mode of and manage the chat history in clients, please the parameter `sequence_end: true` when utilizing the `generate` function, or specify `renew_session: true` when making use of `v1/chat/completions` +4. The `/v1/chat/interactive` api disables engaging in multiple rounds of conversation by default. The input argument `prompt` consists of either single strings or entire chat histories. 
diff --git a/docs/en/serving.md b/docs/en/serving.md index 1e6f783d7a..6cc18018d0 100644 --- a/docs/en/serving.md +++ b/docs/en/serving.md @@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf +lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf bash workspace/service_docker_up.sh ``` @@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2 +lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2 bash workspace/service_docker_up.sh ``` @@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh 70B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8 +lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8 bash workspace/service_docker_up.sh ``` @@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \ +lmdeploy convert llama /path/to/llama-7b llama \ --tokenizer_path /path/to/tokenizer/model bash workspace/service_docker_up.sh ``` @@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \ +lmdeploy convert llama /path/to/llama-13b llama \ --tokenizer_path /path/to/tokenizer/model --tp 2 bash workspace/service_docker_up.sh ``` @@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh 30B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \ +lmdeploy convert llama /path/to/llama-30b llama \ --tokenizer_path /path/to/tokenizer/model --tp 4 bash workspace/service_docker_up.sh ``` @@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh 65B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \ +lmdeploy convert llama /path/to/llama-65b llama \ --tokenizer_path /path/to/tokenizer/model --tp 8 bash workspace/service_docker_up.sh ``` @@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-7b \ --delta-path lmsys/vicuna-7b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b +lmdeploy convert vicuna /path/to/vicuna-7b bash workspace/service_docker_up.sh ``` @@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-13b \ --delta-path lmsys/vicuna-13b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b +lmdeploy convert vicuna /path/to/vicuna-13b bash workspace/service_docker_up.sh ``` diff --git a/docs/en/supported_models/codellama.md b/docs/en/supported_models/codellama.md index 1b51402056..78f4d2ce5d 100644 --- a/docs/en/supported_models/codellama.md +++ b/docs/en/supported_models/codellama.md @@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. 
Execu python3 -m pip install lmdeploy # convert weight layout -python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model +lmdeploy convert codellama /the/path/of/codellama/model ``` Then, you can communicate with codellama in consolo by following instructions in next sections @@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in ### Completion ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap completion +lmdeploy chat turbomind ./workspace --cap completion ``` ### Infilling ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling +lmdeploy chat turbomind ./workspace --cap infilling ``` The input code is supposed to have a special placeholder ``. For example, @@ -64,7 +64,7 @@ And the generated code piece by `turbomind.chat` is the one to be filled in ` Iterable[List[str]]: - headers = {'User-Agent': 'Test Client'} - pload = { - 'prompt': prompt, - 'stream': stream, - 'session_id': session_id, - 'request_output_len': request_output_len, - 'sequence_start': sequence_start, - 'sequence_end': sequence_end, - 'ignore_eos': ignore_eos - } - response = requests.post( - api_url, headers=headers, json=pload, stream=stream) - for chunk in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b'\n'): - if chunk: - data = json.loads(chunk.decode('utf-8')) - output = data['text'] - tokens = data['tokens'] - yield output, tokens - - -for output, tokens in get_streaming_response( - "Hi, how are you?", "http://{server_ip}:{server_port}/generate", 0, - 512): - print(output, end='') +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient('http://{server_ip}:{server_port}') +model_name = api_client.available_models[0] +for item in api_client.completions_v1(model=model_name, prompt='hi'): + print(item) +``` + +LMDeploy 的 `/v1/chat/interactive` api 支持将对话内容管理在服务端,但是我们默认关闭。如果想尝试,请阅读以下介绍: + +- 交互模式下,对话历史保存在 server。在一次完整的多轮对话中,所有请求设置`interactive_mode = True`, `session_id`保持相同 (不为 -1,这是缺省值)。 +- 非交互模式下,server 不保存历史记录。 + +交互模式可以通过 `interactive_mode` 布尔量参数控制。下面是一个普通模式的例子, +如果要体验交互模式,将 `interactive_mode=True` 传入即可。 + +```python +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient('http://{server_ip}:{server_port}') +for item in api_client.generate(prompt='hi'): + print(item) ``` ### Java/Golang/Rust @@ -86,16 +86,15 @@ cURL 也可以用于查看 API 的输出结果 curl http://{server_ip}:{server_port}/v1/models ``` -使用 generate: +Interactive Chat: ```bash -curl http://{server_ip}:{server_port}/generate \ +curl http://{server_ip}:{server_port}/v1/chat/interactive \ -H "Content-Type: application/json" \ -d '{ "prompt": "Hello! How are you?", "session_id": 1, - "sequence_start": true, - "sequence_end": true + "interactive_mode": true }' ``` @@ -106,19 +105,19 @@ curl http://{server_ip}:{server_port}/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "internlm-chat-7b", - "messages": [{"role": "user", "content": "Hello! Ho are you?"}] + "messages": [{"role": "user", "content": "Hello! How are you?"}] }' ``` -Embeddings: +Text Completions: -```bash -curl http://{server_ip}:{server_port}/v1/embeddings \ - -H "Content-Type: application/json" \ +```shell +curl http://{server_ip}:{server_port}/v1/completions \ + -H 'Content-Type: application/json' \ -d '{ - "model": "internlm-chat-7b", - "input": "Hello world!" 
- }' + "model": "llama", + "prompt": "two steps to build a house:" +}' ``` ### CLI client @@ -126,8 +125,8 @@ curl http://{server_ip}:{server_port}/v1/embeddings \ restful api 服务可以通过客户端测试,例如 ```shell -# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 +lmdeploy serve api_client api_server_url ``` ### webui @@ -135,10 +134,10 @@ python -m lmdeploy.serve.openai.api_client restful_api_url 也可以直接用 webui 测试使用 restful-api。 ```shell -# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 -# server_ip 和 server_port 是用来提供 gradio ui 访问服务的 -# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# api_server_url 就是 api_server 产生的,比如 http://localhost:23333 +# server_name 和 server_port 是用来提供 gradio ui 访问服务的 +# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 +lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port} ``` ### FAQ @@ -148,12 +147,6 @@ python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True 2. 当服务端显存 OOM 时,可以适当减小启动服务时的 `instance_num` 个数 -3. 当同一个 `session_id` 的请求给 `generate` 函数后,出现返回空字符串和负值的 `tokens`,应该是第二次问话没有设置 `sequence_start=false` - -4. 如果感觉请求不是并发地被处理,而是一个一个地处理,请设置好以下参数: - - - 不同的 session_id 传入 `generate` api。否则,我们将自动绑定会话 id 为请求端的 ip 地址编号。 +3. 当同一个 `session_id` 的请求给 `/v1/chat/interactive` 函数后,出现返回空字符串和负值的 `tokens`,应该是 `session_id` 混乱了,可以先将交互模式关闭,再重新开启。 -5. `generate` api 和 `v1/chat/completions` 均支持多轮对话。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单词提问,也可以是一段对话历史。 - 两个 api 都是默认开启多伦对话的,如果你想关闭这个功能,然后在客户端管理会话记录,请设置 `sequence_end: true` 传入 `generate`,或者设置 - `renew_session: true` 传入 `v1/chat/completions`。 +4. 
`/v1/chat/interactive` api 支持多轮对话, 但是默认关闭。`messages` 或者 `prompt` 参数既可以是一个简单字符串表示用户的单词提问,也可以是一段对话历史。 diff --git a/docs/zh_cn/serving.md b/docs/zh_cn/serving.md index e0a2f5a986..db4ebb8d3c 100644 --- a/docs/zh_cn/serving.md +++ b/docs/zh_cn/serving.md @@ -8,7 +8,7 @@ 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf +lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf bash workspace/service_docker_up.sh ``` @@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2 +lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2 bash workspace/service_docker_up.sh ``` @@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh 70B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8 +lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8 bash workspace/service_docker_up.sh ``` @@ -42,7 +42,7 @@ bash workspace/service_docker_up.sh 7B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \ +lmdeploy convert llama /path/to/llama-7b llama \ --tokenizer_path /path/to/tokenizer/model bash workspace/service_docker_up.sh ``` @@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh 13B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \ +lmdeploy convert llama /path/to/llama-13b llama \ --tokenizer_path /path/to/tokenizer/model --tp 2 bash workspace/service_docker_up.sh ``` @@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh 30B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \ +lmdeploy convert llama /path/to/llama-30b llama \ --tokenizer_path /path/to/tokenizer/model --tp 4 bash workspace/service_docker_up.sh ``` @@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh 65B ```shell -python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \ +lmdeploy convert llama /path/to/llama-65b llama \ --tokenizer_path /path/to/tokenizer/model --tp 8 bash workspace/service_docker_up.sh ``` @@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-7b \ --delta-path lmsys/vicuna-7b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b +lmdeploy convert vicuna /path/to/vicuna-7b bash workspace/service_docker_up.sh ``` @@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \ --target-model-path /path/to/vicuna-13b \ --delta-path lmsys/vicuna-13b-delta-v1.1 -python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b +lmdeploy convert vicuna /path/to/vicuna-13b bash workspace/service_docker_up.sh ``` diff --git a/docs/zh_cn/supported_models/codellama.md b/docs/zh_cn/supported_models/codellama.md index ca9029a527..017df62b5f 100644 --- a/docs/zh_cn/supported_models/codellama.md +++ b/docs/zh_cn/supported_models/codellama.md @@ -29,7 +29,7 @@ python3 -m pip install lmdeploy # 转模型格式 -python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model +lmdeploy convert codellama /path/of/codellama/model ``` 接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。 @@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model ### 代码续写 ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap completion +lmdeploy chat turbomind ./workspace --cap completion ``` ### 代码填空 ```shell -python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling +lmdeploy chat turbomind ./workspace --cap infilling ``` 输入的代码块中要包含 ``,比如: @@ -64,7 +64,7 @@ def 
remove_non_ascii(s: str) -> str: ### 对话 ``` -python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python" +lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python" ``` 可以把 `--sys-instruct` 的指令换成 codellama 支持的其他变成语言。 @@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid ### Python 专项 ``` -python3 -m lmdeploy.turbomind.chat ./workspace --cap python +lmdeploy chat turbomind ./workspace --cap python ``` 建议这里部署 Python 微调模型 @@ -90,7 +90,7 @@ TBD ```shell # --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数 # --tp: 在 tensor parallel时,使用的GPU数量 -python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 +lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1 ``` 打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。 @@ -98,17 +98,17 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port -- 你可以用命令行,在控制台与 server 通信: ```shell -# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 -python -m lmdeploy.serve.openai.api_client restful_api_url +# api_server_url 就是 api_server 产生的,比如 http://localhost:23333 +lmdeploy serve api_client api_server_url ``` 或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流: ```shell -# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 +# api_server_url 就是 api_server 产生的,比如 http://localhost:23333 # server_ip 和 server_port 是用来提供 gradio ui 访问服务的 -# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True -python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True +# 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 +lmdeploy serve gradio api_server_url --server_name ${gradio_ui_ip} --server_port ${gradio_ui_port} ``` 关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。 diff --git a/docs/zh_cn/w4a16.md b/docs/zh_cn/w4a16.md index 68cc094df8..e0a220eb60 100644 --- a/docs/zh_cn/w4a16.md +++ b/docs/zh_cn/w4a16.md @@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4 ```shell ## 转换模型的layout,存放在默认路径 ./workspace 下 -python3 -m lmdeploy.serve.turbomind.deploy \ +lmdeploy convert \ --model-name llama2 \ --model-path ./llama2-chat-7b-w4 \ --model-format awq \ --group-size 128 ## 推理 -python3 -m lmdeploy.turbomind.chat ./workspace +lmdeploy chat turbomind ./workspace ``` ## 启动 gradio 服务 @@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace 如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务 ```shell -python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port} +lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port} ``` 然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话 @@ -82,7 +82,7 @@ python benchmark/profile_generation.py \ ### 第一步:生成量化参数 ```shell -python3 -m lmdeploy.lite.apis.calibrate \ +lmdeploy lite calibrate \ --model $HF_MODEL \ --calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval --calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 @@ -95,7 +95,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。 ```shell -python3 -m lmdeploy.lite.apis.auto_awq \ +lmdeploy lite auto_awq \ --model $HF_MODEL \ --w_bits 4 \ # 权重量化的 bit 数 --w_group_size 128 \ # 权重量化分组统计尺寸 diff --git a/lmdeploy/cli/__init__.py b/lmdeploy/cli/__init__.py new file 
mode 100644 index 0000000000..3575bec5bd --- /dev/null +++ b/lmdeploy/cli/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .cli import run + +__all__ = ['run'] diff --git a/lmdeploy/cli/chat.py b/lmdeploy/cli/chat.py new file mode 100644 index 0000000000..735b24c7cc --- /dev/null +++ b/lmdeploy/cli/chat.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + + +class SubCliChat(object): + """Chat through terminal with pytorch or turbomind model.""" + + def torch(self, + model_path: str, + tokenizer_path: Optional[str] = None, + accel: Optional[str] = None, + max_new_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + seed: int = 0, + use_fast_tokenizer: bool = True, + max_alloc: int = 2048, + max_session_len: int = None, + log_file: Optional[str] = None, + debug: bool = False, + adapter: Optional[str] = None): + """Chat with pytorch model through terminal. + + Args: + model_path (str): Path to pytorch model. + tokenizer_path (str): Path to tokenizer. + accel (str): Model accelerator. + max_new_tokens (int): Maximum number of tokens to generate. + temperature (float): Temperature for sampling. + top_p (float): Top p for sampling. + seed (int): Random seed. + use_fast_tokenizer (bool): Whether to use fast tokenizer. + This argument is directly pass to transformer's + ``AutoTokenizer.from_pretrained``. + Generally, user should choose to use fast tokenizers. + But if using fast raise some error, try to force using a slow one. + max_alloc (int): Maximum memory to allocate (for deepspeed). + max_session_len (int): Maximum number of tokens allowed for all chat sessions. + This include both history and current session. + log_file (str): Path to log file. + debug (bool): Whether to enable debug mode. + adapter (str): Force to use an adapter. + Generally user should not use this argument because adapter is selected based + on the type of model. Only when it is impossible, e.g. distinguishing llama 1/2 + based on `LlamaforCausalLM` class, this argument is required. + Currently, only "llama1" is acceptable for llama1 models. + """ # noqa: E501 + from lmdeploy.pytorch.chat import main as run_torch_model + + run_torch_model(model_path, + tokenizer_path=tokenizer_path, + accel=accel, + max_new_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + seed=seed, + use_fast_tokenizer=use_fast_tokenizer, + max_alloc=max_alloc, + max_session_len=max_session_len, + log_file=log_file, + debug=debug, + adapter=adapter) + + def turbomind(self, + model_path, + session_id: int = 1, + cap: str = 'chat', + tp=1, + stream_output=True, + **kwargs): + """Chat with turbomind model through terminal. + + Args: + model_path (str): the path of the deployed model + session_id (int): the identical id of a session + cap (str): the capability of a model. For example, codellama has + the ability among ['completion', 'infilling', 'chat', 'python'] + tp (int): GPU number used in tensor parallelism + stream_output (bool): indicator for streaming output or not + **kwarg (dict): other arguments for initializing model's chat + template + """ + from lmdeploy.turbomind.chat import main as run_turbomind_model + + run_turbomind_model(model_path, + session_id=session_id, + cap=cap, + tp=tp, + stream_output=stream_output, + **kwargs) diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py new file mode 100644 index 0000000000..ab15cb46ad --- /dev/null +++ b/lmdeploy/cli/cli.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import os + +import fire + +from .chat import SubCliChat +from .lite import SubCliLite +from .serve import SubCliServe + + +class CLI(object): + """LMDeploy Command Line Interface. + + The CLI provides a unified API for converting, compressing and deploying + large language models. + """ + + def convert(self, + model_name: str, + model_path: str, + model_format: str = None, + tokenizer_path: str = None, + dst_path: str = './workspace', + tp: int = 1, + quant_path: str = None, + group_size: int = 0): + """Convert LLMs to lmdeploy format. + + Args: + model_name (str): The name of the to-be-deployed model, such as + llama-7b, llama-13b, vicuna-7b and etc. + model_path (str): The directory path of the model + model_format (str): the format of the model, should choose from + ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama + format, 'hf' means huggingface llama format, and 'awq' means + llama(hf) model quantized by lmdeploy/lite/quantization/awq.py. + the default value is None, which means the model_format will be + inferred based on model_name + tokenizer_path (str): The path of tokenizer model. + dst_path (str): The destination path that saves outputs. + tp (int): The number of GPUs used for tensor parallelism, which + should be 2^n. + quant_path (str): Path of the quantized model, which can be None. + group_size (int): A parameter used in AWQ to quantize fp16 weights + to 4 bits. + """ + from lmdeploy.turbomind.deploy.converter import main as convert + + convert(model_name, + model_path, + model_format=model_format, + tokenizer_path=tokenizer_path, + dst_path=dst_path, + tp=tp, + quant_path=quant_path, + group_size=group_size) + + def list(self, engine: str = 'turbomind'): + """List supported model names. + + Examples 1: + lmdeploy list + + Examples 2: + lmdeploy list --engine pytorch + + Args: + engine (str): The backend for the model to run. Choice from + ['turbomind', 'pytorch']. + """ + assert engine in ['turbomind', 'pytorch'] + if engine == 'pytorch': + model_names = ['llama', 'llama2', 'internlm-7b'] + elif engine == 'turbomind': + from lmdeploy.model import MODELS + model_names = list(MODELS.module_dict.keys()) + model_names = [n for n in model_names if n.lower() not in ['base']] + model_names.sort() + print('Supported model names:') + print('\n'.join(model_names)) + + def check_env(self, dump_file: str = None): + """Check env information. + + Args: + dump_file (str): Output file to save env info. 
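For orientation, here is a minimal sketch of what `lmdeploy list`, `lmdeploy check_env` and `lmdeploy convert` resolve to once `fire` dispatches onto the `CLI` class above; the model name follows the docstring's examples and the checkpoint path is a placeholder.

```python
# Rough programmatic equivalent of the new CLI commands; '/path/to/llama2-hf' is a
# placeholder checkpoint directory, './workspace' is convert's default destination.
from lmdeploy.cli.cli import CLI

cli = CLI()
cli.list(engine='turbomind')        # print the model names supported by TurboMind
cli.check_env()                     # collect and print environment info
cli.convert('llama2',               # registered model name
            '/path/to/llama2-hf',   # HF checkpoint directory
            dst_path='./workspace',
            tp=1)
```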
+ """ + + import importlib + + import mmengine + from mmengine.utils import get_git_hash + from mmengine.utils.dl_utils import collect_env + + from lmdeploy.version import __version__ + + env_info = collect_env() + env_info['LMDeploy'] = __version__ + '+' + get_git_hash()[:7] + + # remove some unnecessary info + remove_reqs = ['MMEngine', 'OpenCV'] + for req in remove_reqs: + if req in env_info: + env_info.pop(req) + + # extra important dependencies + extra_reqs = ['transformers', 'gradio', 'fastapi', 'pydantic'] + + for req in extra_reqs: + try: + env_info[req] = importlib.import_module(req).__version__ + except Exception: + env_info[req] = 'Not Found' + + # print env info + for k, v in env_info.items(): + print(f'{k}: {v}') + + # dump to local file + if dump_file is not None: + work_dir, _ = os.path.split(dump_file) + if work_dir: + os.makedirs(work_dir, exist_ok=True) + mmengine.dump(env_info, dump_file) + + +def run(): + """The entry point of running LMDeploy CLI.""" + + cli = CLI() + cli.lite = SubCliLite() + cli.chat = SubCliChat() + cli.serve = SubCliServe() + + fire.Fire(cli, name='lmdeploy') diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py new file mode 100644 index 0000000000..4302765e28 --- /dev/null +++ b/lmdeploy/cli/lite.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. + + +class SubCliLite(object): + """CLI for compressing LLMs.""" + + def auto_awq(self, + model: str, + work_dir: str, + w_bits: int = 4, + w_sym: bool = False, + w_group_size: int = 128, + device: str = 'cuda'): + """Perform weight quantization using AWQ algorithm. + + Args: + model (str): The path of model in hf format. + work_dir (str): The working directory to save results. + w_bits (int): Bit number for weight quantization. + w_sym (bool): Whether to do symmetric quantization. + w_group_size (int): Group size for weight quantization statistics. + device (str): Device type of running. + """ + from lmdeploy.lite.apis.auto_awq import auto_awq + + auto_awq(model, + work_dir, + w_bits=w_bits, + w_sym=w_sym, + w_group_size=w_group_size, + device=device) + + def calibrate(self, + model: str, + calib_dataset: str = 'c4', + calib_samples: int = 128, + calib_seqlen: int = 2048, + work_dir: str = './work_dir', + device: str = 'cuda') -> None: + """Perform calibration on a given dataset. + + Args: + model (str): The model to be loaded. + calib_dataset (str, optional): The calibration dataset name. + Defaults to 'c4'. + calib_samples (int, optional): The number of samples for + calibration. Defaults to 128. + calib_seqlen (int, optional): The sequence length for calibration. + Defaults to 2048. + work_dir (str): The working directory for outputs. + Defaults to './work_dir'. + device (str, optional): The device to be used for calculation. + Defaults to 'cuda'. + """ + from lmdeploy.lite.apis.calibrate import calibrate + + calibrate(model, + calib_dataset=calib_dataset, + calib_samples=calib_samples, + calib_seqlen=calib_seqlen, + work_dir=work_dir, + device=device) + + def kv_qparams(self, + work_dir: str, + turbomind_dir: str, + kv_bits: int = 8, + kv_sym: bool = False, + num_tp: int = 1) -> None: + """Export key and value stats. + + Args: + work_dir (str): Directory path where the stats + are saved. + turbomind_dir (str): Directory path where to + save the results. + kv_bits (int, optional): Number of bits for quantization. + Defaults to 8. + kv_sym (bool, optional): Whether to use symmetric quantization. + Defaults to False. + num_tp (int, optional): Number of tensor parallelism. 
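A sketch of the Python-level W4A16 flow that `lmdeploy lite calibrate` and `lmdeploy lite auto_awq` wrap, assuming a supported HF checkpoint at a placeholder path.

```python
# Minimal sketch of the calibrate -> auto_awq flow wrapped by the SubCliLite methods;
# '/path/to/hf_model' is a placeholder for a supported HF checkpoint.
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.apis.auto_awq import auto_awq

calibrate('/path/to/hf_model',
          calib_dataset='c4',    # also: ptb, wikitext2, pileval
          calib_samples=128,
          calib_seqlen=2048,
          work_dir='./work_dir')
auto_awq('/path/to/hf_model',
         './work_dir',           # directory holding the activation stats from calibrate
         w_bits=4,
         w_group_size=128)
```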
+ Defaults to 1. + """ + from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams + + run_kv_qparams(work_dir, + turbomind_dir, + kv_bits=kv_bits, + kv_sym=kv_sym, + num_tp=num_tp) + + def get_small_sharded_hf(self, src_dir: str, dst_dir: str): + """Convert a hugging face model to the smallest sharded one. + + Args: + src_dir (str): The directory of the input HF model. + dst_dir (str): The directory to save new model. + """ + from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded + run_sharded(src_dir, dst_dir) diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py new file mode 100644 index 0000000000..33580cdfe1 --- /dev/null +++ b/lmdeploy/cli/serve.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + + +class SubCliServe(object): + """Serve LLMs and interact on terminal or web UI.""" + + def gradio(self, + model_path_or_server: str, + server_name: str = '0.0.0.0', + server_port: int = 6006, + batch_size: int = 32, + tp: int = 1, + restful_api: bool = False): + """Serve LLMs with web ui using gradio. + + Example 1: + lmdeploy serve gradio ./workspace + + Example 2: + lmdeploy serve gradio http://0.0.0.0:23333 + --server_name 0.0.0.0 + --server_port 6006 + --restful_api True + + Example 3: + lmdeploy serve gradio ${triton_server_ip_addresss}:33337 + + Args: + model_path_or_server (str): the path of the deployed model or the + tritonserver URL or restful api URL. The former is for directly + running service with gradio. The latter is for running with + tritonserver by default. If the input URL is restful api. + Please enable another flag `restful_api`. + server_name (str): the ip address of gradio server + server_port (int): the port of gradio server + batch_size (int): batch size for running Turbomind directly + tp (int): tensor parallel for Turbomind + restful_api (bool): a flag for model_path_or_server + """ + from lmdeploy.serve.gradio.app import run + run(model_path_or_server, + server_name=server_name, + server_port=server_port, + batch_size=batch_size, + tp=tp, + restful_api=restful_api) + + def api_server(self, + model_path: str, + server_name: str = '0.0.0.0', + server_port: int = 23333, + instance_num: int = 32, + tp: int = 1, + allow_origins: List[str] = ['*'], + allow_credentials: bool = True, + allow_methods: List[str] = ['*'], + allow_headers: List[str] = ['*']): + """Serve LLMs with restful api using fastapi. + + Args: + model_path (str): the path of the deployed model + server_name (str): host ip for serving + server_port (int): server port + instance_num (int): number of instances of turbomind model + tp (int): tensor parallel + allow_origins (List[str]): a list of allowed origins for CORS + allow_credentials (bool): whether to allow credentials for CORS + allow_methods (List[str]): a list of allowed HTTP methods for CORS + allow_headers (List[str]): a list of allowed HTTP headers for CORS + """ + from lmdeploy.serve.openai.api_server import main as run_api_server + + run_api_server(model_path, + server_name=server_name, + server_port=server_port, + instance_num=instance_num, + tp=tp, + allow_origins=allow_origins, + allow_credentials=allow_credentials, + allow_methods=allow_methods, + allow_headers=allow_headers) + + def api_client(self, restful_api_url: str, session_id: int = 0): + """Interact with restful api server in terminal. + + Args: + restful_api_url: The restful api URL. + session_id: The identical id of a session. 
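As a rough equivalent, `lmdeploy serve api_server ./workspace --instance_num 32 --tp 1` dispatches to the method above approximately as follows; the workspace path is a placeholder and the host/port are the defaults declared above.

```python
# Rough equivalent of `lmdeploy serve api_server ./workspace --instance_num 32 --tp 1`;
# './workspace' is a placeholder for a converted TurboMind model.
from lmdeploy.cli.serve import SubCliServe

serve = SubCliServe()
serve.api_server('./workspace',
                 server_name='0.0.0.0',
                 server_port=23333,
                 instance_num=32,
                 tp=1)
# In another terminal: lmdeploy serve api_client http://0.0.0.0:23333
```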
+ """ + from lmdeploy.serve.openai.api_client import main as run_api_client + run_api_client(restful_api_url, session_id=session_id) + + def triton_client(self, + tritonserver_addr: str, + session_id: int = 1, + cap: str = 'chat', + stream_output: bool = True, + **kwargs): + """Interact with Triton Server using gRPC protocol. + + Args: + tritonserver_addr (str): the address in format "ip:port" of + triton inference server + session_id (int): the identical id of a session + cap (str): the capability of a model. For example, codellama + has the ability among ['completion', 'infill', 'instruct', + 'python'] + stream_output (bool): indicator for streaming output or not + **kwargs (dict): other arguments for initializing model's + chat template + """ + + from lmdeploy.serve.client import main as run_triton_client + + run_triton_client( + tritonserver_addr, + session_id=session_id, + cap=cap, + stream_output=stream_output, + **kwargs, + ) diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index 3517f51b85..38f067b563 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -2,7 +2,6 @@ from pathlib import Path -import fire import torch from accelerate import (infer_auto_device_map, init_empty_weights, load_checkpoint_in_model) @@ -16,13 +15,15 @@ LAYER_TYPE_MAP = { 'InternLMForCausalLM': 'InternLMDecoderLayer', 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', + 'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B + 'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B 'LlamaForCausalLM': 'LlamaDecoderLayer', } NORM_TYPE_MAP = { 'InternLMForCausalLM': 'InternLMRMSNorm', 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', + 'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B + 'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B 'LlamaForCausalLM': 'LlamaRMSNorm', } @@ -41,6 +42,9 @@ def auto_awq(model: str, hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True) checkpoint = hf_config._name_or_path + # hard code for qwen, other configs do not have the `fp16` attribute. + hf_config.fp16 = True + with init_empty_weights(): # Load model model = AutoModelForCausalLM.from_pretrained(model, @@ -62,11 +66,14 @@ def auto_awq(model: str, device_map[name] = 'cpu' else: device_map[name] = 0 - load_checkpoint_in_model(model, checkpoint, device_map) + load_checkpoint_in_model(model, + checkpoint, + device_map, + dtype=torch.float16) work_dir = Path(work_dir) - act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmean'] + act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax'] layers = collect_target_modules(model, layer_type) fcs = {} for l_name, layer in layers.items(): @@ -81,5 +88,6 @@ def auto_awq(model: str, if __name__ == '__main__': + import fire fire.Fire(auto_awq) diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 38b6429a19..27d631bdad 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from pathlib import Path +from typing import Union -import fire import torch from accelerate import (infer_auto_device_map, init_empty_weights, load_checkpoint_in_model) +from torch import nn from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from lmdeploy.lite.quantization import CalibrationContext @@ -14,17 +15,90 @@ LAYER_TYPE_MAP = { 'InternLMForCausalLM': 'InternLMDecoderLayer', 'QWenLMHeadModel': 'QWenBlock', - 'BaiChuanForCausalLM': 'DecoderLayer', + 'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B + 'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B 'LlamaForCausalLM': 'LlamaDecoderLayer', } NORM_TYPE_MAP = { 'InternLMForCausalLM': 'InternLMRMSNorm', 'QWenLMHeadModel': 'RMSNorm', - 'BaiChuanForCausalLM': 'RMSNorm', + 'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B + 'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B 'LlamaForCausalLM': 'LlamaRMSNorm', } +def _prepare_for_calibrate(model: nn.Module, + layer_type: Union[str, type], + head_name: str = 'lm_head', + device: str = 'cuda', + prefix: str = '') -> None: + """Prepare the model for calibration by moving specific modules to CPU. + + This function goes through each child of a given model and checks whether + it is an instance of a certain layer type or has the name equal to + `head_name`. + If yes, it moves the module to CPU, otherwise to the specified device + (default is CUDA). + + If the child contains the target layer type in its sub-modules, the + function performs the same operation recursively. + + Parameters + ---------- + model : nn.Module + The PyTorch model to prepare for calibration. + layer_type : Union[str, Type] + The type of the layer to be moved to CPU. Can be either a string of + class name or the class type itself. + head_name : str, optional + The name of the module to be moved to CPU. Default is 'lm_head'. + device : str, optional + The device to which modules not matching the `layer_type` or + `head_name` will be moved. Default is 'cuda'. + prefix : str, optional + The prefix used when printing the names of the moved modules. + Default is ''. + + Raises + ------ + TypeError + If `layer_type` is neither a string nor a type. 
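A toy invocation of the helper documented above, assuming a Llama-family HF checkpoint at a placeholder path; the string class name comes from `LAYER_TYPE_MAP`.

```python
# Toy sketch of the placement contract described above: decoder layers and lm_head
# stay on CPU, the remaining modules move to the calibration device.
from transformers import AutoModelForCausalLM
from lmdeploy.lite.apis.calibrate import _prepare_for_calibrate

model = AutoModelForCausalLM.from_pretrained('/path/to/llama-hf', torch_dtype='auto')
_prepare_for_calibrate(model,
                       layer_type='LlamaDecoderLayer',
                       head_name='lm_head',
                       device='cuda')
```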
+ """ + + for name, child in model.named_children(): + + # Check if the child is an instance of the given layer type + if isinstance(layer_type, str): + is_layer = type(child).__name__ == layer_type + elif isinstance(layer_type, type): + is_layer = isinstance(child, layer_type) + else: + raise TypeError( + 'layer_type should be a string (class name) or a type') + + # Check if the child contains the target module type + contain_layer = len( + collect_target_modules(child, layer_type, [head_name]).keys()) > 0 + + # Check if the child matches the head name + is_head = name == head_name + + mod_name = f'{prefix}.{name}' if prefix else name + + # If the child is either an instance of the layer type or has the + # head name, move it to CPU, otherwise move it to the specified device + if is_layer or is_head: + child.to('cpu') + print(f'Move {mod_name} to CPU.') + elif contain_layer: + _prepare_for_calibrate(child, layer_type, head_name, device, + mod_name) + else: + child.to(device) + print(f'Move {mod_name} to GPU.') + + def calibrate(model: str, calib_dataset: str = 'c4', calib_samples: int = 128, @@ -55,16 +129,38 @@ def calibrate(model: str, tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, trust_remote_code=True) - hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True) + hf_config = AutoConfig.from_pretrained(model, + torch_dtype=torch.float16, + trust_remote_code=True) checkpoint = hf_config._name_or_path + # hard code for qwen, other configs do not have the `fp16` attribute. + hf_config.fp16 = True + with init_empty_weights(): # Load model model = AutoModelForCausalLM.from_pretrained(model, + config=hf_config, torch_dtype=torch.float16, trust_remote_code=True) model.config.use_cache = False + model_type = type(model).__name__ + if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP: + raise RuntimeError( + f'Currently, quantification and calibration of {model_type} are ' + f'not supported. 
The supported model types are ' + f"{', '.join(LAYER_TYPE_MAP.keys())}.") + + if model_type == 'QWenLMHeadModel': + try: + import flash_attn # noqa: F401 + except ImportError: + raise RuntimeError( + 'When using Qwen, you need to `pip install flash-attn` first, ' + 'otherwise calibration and quantification will not work ' + 'properly.') + layer_type = LAYER_TYPE_MAP[type(model).__name__] norm_type = NORM_TYPE_MAP[type(model).__name__] @@ -78,7 +174,12 @@ def calibrate(model: str, device_map[name] = 'cpu' else: device_map[name] = 0 - load_checkpoint_in_model(model, checkpoint, device_map) + load_checkpoint_in_model(model, + checkpoint, + device_map, + dtype=torch.float16) + + _prepare_for_calibrate(model, layer_type, 'lm_head', device) print('Loading calibrate dataset ...') calib_loader, _ = get_calib_loaders(calib_dataset, @@ -107,4 +208,6 @@ def calibrate(model: str, if __name__ == '__main__': + import fire + fire.Fire(calibrate) diff --git a/lmdeploy/lite/apis/kv_qparams.py b/lmdeploy/lite/apis/kv_qparams.py index 7d43078daf..f31fee0299 100644 --- a/lmdeploy/lite/apis/kv_qparams.py +++ b/lmdeploy/lite/apis/kv_qparams.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Union -import fire import numpy as np import torch @@ -120,5 +119,6 @@ def main(work_dir: str, if __name__ == '__main__': + import fire fire.Fire(main) diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index c9811563fd..4dca8b1469 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -18,6 +18,10 @@ 'QWenBlock': { 'ln_1': ['attn.c_attn'], 'ln_2': ['mlp.w1', 'mlp.w2'] + }, + 'DecoderLayer': { + 'input_layernorm': ['self_attn.W_pack'], + 'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj'] } } @@ -33,6 +37,10 @@ 'QWenBlock': { 'attn.c_attn': ['attn.c_proj'], 'mlp.w1': ['mlp.c_proj'] + }, + 'DecoderLayer': { + 'self_attn.W_pack': ['self_attn.o_proj'], + 'mlp.up_proj': ['mlp.down_proj'] } } @@ -69,7 +77,7 @@ def smooth_ln_fcs(ln: torch.nn.Module, w_scales = get_weight_scale(concat_w, group_size) scales = (act_scales.pow(alpha) / - w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype) + w_scales.pow(1 - alpha)).to(device).to(dtype) scales = scales / (scales.max() * scales.min()).sqrt() ln.weight.div_(scales) @@ -116,10 +124,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module, w_scales = get_weight_scale(concat_w, group_size) scales = (act_scales.pow(alpha) / - w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype) + w_scales.pow(1 - alpha)).to(device).to(dtype) scales = scales / (scales.max() * scales.min()).sqrt() - # (for qwen) pre_fc is packed QKV, only V needs to scale + # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale if size_pre_fc > size_a and size_pre_fc % size_a == 0 \ and size_pre_fc // size_a == 3: diff --git a/lmdeploy/lite/quantization/weight/quantizer.py b/lmdeploy/lite/quantization/weight/quantizer.py index 56cfda8f01..1d01696eb9 100644 --- a/lmdeploy/lite/quantization/weight/quantizer.py +++ b/lmdeploy/lite/quantization/weight/quantizer.py @@ -8,7 +8,7 @@ cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax, - cal_qparams_per_tensor_minmax) + cal_qparams_per_tensor_minmax, precise_round) from lmdeploy.lite.utils.global_avail import GlobalAvailMixin @@ -119,8 +119,10 @@ def quant(self, torch.Tensor: The fake quantized weight tensor. 
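The smoothing scales in `smooth_ln_fcs`/`smooth_fc_fcs` above (now computed without the `1e-4` clamp) can be illustrated with a small standalone sketch; the numbers and `alpha` are arbitrary, only the formula mirrors the patched code.

```python
# Standalone illustration of the AWQ smoothing scales computed above.
import torch

alpha = 0.5
act_scales = torch.tensor([2.0, 0.5, 1.0])  # per-channel activation stats
w_scales = torch.tensor([1.0, 1.0, 1.0])    # per-channel weight stats

scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
scales = scales / (scales.max() * scales.min()).sqrt()
print(scales)  # ln.weight.div_(scales) applies these per-channel scales in the code above
```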
""" + float_w = weight.float() + if qparams is None: - qparams = self.calculate_qparams(weight) + qparams = self.calculate_qparams(float_w) scales = qparams.scales zero_points = qparams.zero_points @@ -133,17 +135,18 @@ def quant(self, # per group scales shape: [out_c, in_c//group_size, 1] if len(scales.shape) > 2: # scales shape: [out_c, in_c//group_size, 1] - weight = weight.reshape(out_c, scales.shape[1], -1) + float_w = float_w.reshape(out_c, scales.shape[1], -1) if zero_points is None: assert self.symmetry - real_qweight = (weight / scales).round() + real_qweight = (float_w / scales).round() fake_qweight = real_qweight * scales else: assert not self.symmetry - real_qweight = (weight / scales).round() + zero_points + real_qweight = precise_round( + (float_w - float_w.min(-1, keepdim=True)[0]) / scales) fake_qweight = (real_qweight - zero_points) * scales if len(scales.shape) > 2: @@ -153,4 +156,4 @@ def quant(self, if real: return real_qweight.to(torch.int32) else: - return fake_qweight + return fake_qweight.to(weight.dtype) diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py index c2b56287bd..2561fdb23f 100644 --- a/lmdeploy/lite/utils/__init__.py +++ b/lmdeploy/lite/utils/__init__.py @@ -6,7 +6,7 @@ cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax, - cal_qparams_per_tensor_minmax) + cal_qparams_per_tensor_minmax, precise_round) from .calib_dataloader import get_calib_loaders from .collect import (bimap_name_mod, collect_target_modules, collect_target_weights) @@ -16,7 +16,7 @@ 'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax', 'cal_qparams_per_group_absmax', 'cal_qparams_per_group_minmax', 'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax', - 'QParams', 'get_calib_loaders', 'collect_target_modules', + 'QParams', 'get_calib_loaders', 'collect_target_modules', 'precise_round', 'collect_target_weights', 'GlobalAvailMixin', 'split_decoder_layer_inputs', 'bimap_name_mod', 'concat_decoder_layer_outputs' ] diff --git a/lmdeploy/lite/utils/cal_qparams.py b/lmdeploy/lite/utils/cal_qparams.py index a682704a55..569297cdb5 100644 --- a/lmdeploy/lite/utils/cal_qparams.py +++ b/lmdeploy/lite/utils/cal_qparams.py @@ -11,16 +11,22 @@ class QParams(NamedTuple): zero_points: Optional[torch.Tensor] +@torch.no_grad() +def precise_round(x): + return x.sign() * (x.abs() + 0.5).floor() + + @torch.no_grad() def cal_qparams_per_channel_absmax(w: torch.Tensor, n_bits: int, return_stats: bool = False) -> QParams: """Calculate quantization parameters for each channel using absolute max value.""" + float_w = w.float() - absmax = w.abs().max(dim=-1, keepdim=True)[0] + absmax = float_w.abs().max(dim=-1, keepdim=True)[0] q_max = 2**(n_bits - 1) - 1 - scales = absmax.clamp(min=1e-5).div(q_max) + scales = absmax.div(q_max) if return_stats: return QParams(scales=scales, zero_points=None), absmax @@ -35,14 +41,16 @@ def cal_qparams_per_channel_minmax(w: torch.Tensor, """Calculate quantization parameters for each channel using min and max values.""" - w_min = w.min(dim=-1, keepdim=True)[0] - w_max = w.max(dim=-1, keepdim=True)[0] + float_w = w.float() + + w_min = float_w.min(dim=-1, keepdim=True)[0] + w_max = float_w.max(dim=-1, keepdim=True)[0] q_max = 2**n_bits - 1 scales = (w_max - w_min) - scales = scales.clamp_(min=1e-5).div_(q_max) + scales = scales.div_(q_max) - zero_points = (-w_min / scales).round() + zero_points = precise_round(-w_min / scales) if return_stats: return QParams(scales=scales, 
zero_points=zero_points), (w_min, w_max) @@ -63,9 +71,12 @@ def cal_qparams_per_group_absmax(w: torch.Tensor, 'Input channels should be greater than or equal to group_size.' assert inc % group_size == 0, \ 'Input channels should be divisible by group_size.' - absmax = w.abs().reshape(outc, -1, group_size).max(dim=-1, keepdim=True)[0] + + float_w = w.float() + absmax = float_w.abs().reshape(outc, -1, group_size).max(dim=-1, + keepdim=True)[0] q_max = 2**(n_bits - 1) - 1 - scales = absmax.clamp(min=1e-5).div(q_max) + scales = absmax.div(q_max) if return_stats: return QParams(scales=scales, zero_points=None), absmax else: @@ -85,14 +96,16 @@ def cal_qparams_per_group_minmax(w: torch.Tensor, 'Input channels should be greater than or equal to group_size.' assert inc % group_size == 0, \ 'Input channels should be divisible by group_size.' - w_group_wise = w.reshape(outc, -1, group_size) + + float_w = w.float() + w_group_wise = float_w.reshape(outc, -1, group_size) w_min = w_group_wise.min(dim=-1, keepdim=True)[0] w_max = w_group_wise.max(dim=-1, keepdim=True)[0] q_max = 2**n_bits - 1 scales = (w_max - w_min) - scales = scales.clamp_(min=1e-5).div_(q_max) - zero_points = (-w_min / scales).round() + scales = scales.div_(q_max) + zero_points = precise_round(-w_min / scales) if return_stats: return QParams(scales=scales, zero_points=zero_points), (w_min, w_max) else: @@ -106,13 +119,15 @@ def cal_qparams_per_tensor_minmax(w: torch.Tensor, """Calculate quantization parameters for the entire tensor using min and max values.""" - w_min = w.min() - w_max = w.max() + float_w = w.float() + + w_min = float_w.min() + w_max = float_w.max() q_max = 2**n_bits - 1 scales = (w_max - w_min) scales = scales.clamp_(min=1e-5).div_(q_max) - zero_points = (-w_min / scales).round() + zero_points = precise_round(-w_min / scales) if return_stats: return QParams(scales=scales, zero_points=zero_points), (w_min, w_max) else: @@ -125,9 +140,10 @@ def cal_qparams_per_tensor_absmax(w: torch.Tensor, return_stats: bool = False) -> QParams: """Calculate quantization parameters for the entire tensor using absolute max value.""" - absmax = w.abs().max() + float_w = w.float() + absmax = float_w.abs().max() q_max = 2**(n_bits - 1) - 1 - scales = absmax.clamp(min=1e-5).div(q_max) + scales = absmax.div(q_max) if return_stats: return QParams(scales=scales, zero_points=None), absmax diff --git a/lmdeploy/lite/utils/collect.py b/lmdeploy/lite/utils/collect.py index 8b2691a4a6..3b66ef6146 100644 --- a/lmdeploy/lite/utils/collect.py +++ b/lmdeploy/lite/utils/collect.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Dict, List, Tuple, Union -from mmengine.config.lazy import LazyAttr from torch import nn @@ -22,9 +21,6 @@ def collect_target_modules(model: nn.Module, A dictionary mapping from module names to module instances. 
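`precise_round`, introduced a few hunks above, differs from `torch.round` (round half to even) by rounding half away from zero; a quick check of the difference:

```python
# Quick check of the rounding behaviour: torch.round is round-half-to-even,
# precise_round (defined above) is round-half-away-from-zero.
import torch
from lmdeploy.lite.utils import precise_round

x = torch.tensor([0.5, 1.5, 2.5, -0.5, -1.5])
print(torch.round(x))    # tensor([0., 2., 2., -0., -2.])
print(precise_round(x))  # tensor([1., 2., 3., -1., -2.])
```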
""" - if isinstance(target, LazyAttr): - target = target.build() - if not isinstance(target, (type, str)): raise TypeError('Target must be a string (name of the module) ' 'or a type (class of the module)') diff --git a/lmdeploy/model.py b/lmdeploy/model.py index b3fc86f999..81b8229f6a 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -115,6 +115,7 @@ def update_input_ids(self, input_ids: List[int]): return input_ids +@MODELS.register_module(name='wizardlM') @MODELS.register_module(name='vicuna') class Vicuna(BaseModel): """Chat template of vicuna model.""" @@ -177,15 +178,16 @@ class InternLMChat7B(BaseModel): def __init__( self, - system='<|System|>', + system='<|System|>:', meta_instruction="""You are an AI assistant whose name is InternLM (书生·浦语). - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. """, # noqa: E501 - user='<|User|>', - eoh='', - eoa='', - assistant='<|Bot|>', + user='<|User|>:', + eoh='\n', + eoa='\n', + eosys='\n', + assistant='<|Bot|>:', stop_words=[''], **kwargs): super().__init__(**kwargs) @@ -194,6 +196,7 @@ def __init__( self.user = user self.eoh = eoh self.eoa = eoa + self.eosys = eosys self.assistant = assistant self.stop_words = stop_words @@ -211,12 +214,12 @@ def decorate_prompt(self, prompt, sequence_start=True): assert self.capability == 'chat', \ f'{type(self).__name__} has no capability of {self.capability}' if sequence_start: - return f'{self.system}:{self.meta_instruction}\n' \ - f'{self.user}:{prompt}{self.eoh}\n' \ - f'{self.assistant}:' + return f'{self.system}{self.meta_instruction}{self.eosys}' \ + f'{self.user}{prompt}{self.eoh}' \ + f'{self.assistant}' else: - return f'\n{self.user}:{prompt}{self.eoh}\n' \ - f'{self.assistant}:' + return f'\n{self.user}{prompt}{self.eoh}' \ + f'{self.assistant}' def messages2prompt(self, messages, sequence_start=True): """Return the prompt that is concatenated with other elements in the @@ -227,17 +230,19 @@ def messages2prompt(self, messages, sequence_start=True): Returns: str: the concatenated prompt """ + if isinstance(messages, str): return self.get_prompt(messages, sequence_start) - system, users, assistants = self._translate_messages(messages) - system = self.meta_instruction if not system else system - ret = f'{self.system}:{system}\n' - for user, assistant in zip(users, assistants): - if assistant: - ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \ - f'{assistant}{self.eoa}\n' - else: - ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' + eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys) + ret = '' + if self.meta_instruction: + ret += f'{self.system}:{self.meta_instruction}{self.eosys}' + + for message in messages: + role = message['role'] + content = message['content'] + ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}' + ret += f'{self.assistant}:' return ret @@ -386,15 +391,16 @@ def messages2prompt(self, messages, sequence_start=True): """ if isinstance(messages, str): return self.get_prompt(messages, sequence_start) - system, users, assistants = self._translate_messages(messages) - system = self.system if not system else system - ret = f'{system}{self.meta_instruction}{self.eosys}' - for user, assistant in zip(users, assistants): - if assistant: - ret += f'{self.user}{user}{self.eoh}{self.assistant}' \ - 
f'{assistant}{self.eoa}' - else: - ret += f'{self.user}{user}{self.eoh}{self.assistant}' + eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys) + ret = '' + if self.meta_instruction: + ret += f'{self.system}{self.meta_instruction}{self.eosys}' + + for message in messages: + role = message['role'] + content = message['content'] + ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}' + ret += f'{self.assistant}' return ret @@ -625,6 +631,141 @@ def update_input_ids(self, input_ids: List): return input_ids +@MODELS.register_module(name='solar') +class SOLAR(BaseModel): + """Chat template of SOLAR model. + + `https://huggingface.co/upstage/SOLAR-0-70b-16bit` + """ + + def __init__(self, + b_sys='### System:\n', + e_sys='\n\n', + user='### User:\n', + eoh='\n\n', + assistant='### Assistant:\n', + eoa='\n\n', + system='', + session_len=2048, + **kwargs): + super().__init__(**kwargs) + self.b_sys = b_sys + self.e_sys = e_sys + self.user = user + self.eoh = eoh + self.assistant = assistant + self.eoa = eoa + self.system = system + self.session_len = session_len + + def decorate_prompt(self, prompt, sequence_start=True): + """Return the prompt that is concatenated with other elements in the + chat template. + + Args: + prompt (str): user's input prompt + sequence_start (bool): indicator for the first round chat of a + session sequence + Returns: + str: the concatenated prompt + """ + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' + if sequence_start: + return f'{self.b_sys}{self.system}{self.e_sys}' \ + f'{self.user}{prompt}{self.eoh}{self.assistant}' + + return f'{self.user}{prompt}{self.eoh}{self.assistant}' + + def messages2prompt(self, messages, sequence_start=True): + """Return the prompt that is concatenated with other elements in the + chat template. + + Args: + messages (str | List): user's input prompt + Returns: + str: the concatenated prompt + """ + if isinstance(messages, str): + return self.get_prompt(messages, sequence_start) + system, users, assistants = self._translate_messages(messages) + system = self.system if not system else system + ret = f'{self.b_sys}{system}{self.e_sys}' + for i, (user, assistant) in enumerate(zip(users, assistants)): + ret += f'{self.user}{user}{self.eoh}{self.assistant}' + if assistant: + ret += f'{assistant}{self.eoa}' + return ret + + +@MODELS.register_module(name='ultracm') +@MODELS.register_module(name='ultralm') +class UltraChat(BaseModel): + """Template of UltraCM and UltraLM models. + + `https://huggingface.co/openbmb/UltraCM-13b` + `https://huggingface.co/openbmb/UltraLM-13b` + """ + + def __init__( + self, + system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501 + eos='', + user='User: ', + assistant='Assistant: ', + session_len=2048, + **kwargs): + super().__init__(**kwargs) + self.system = system + self.eos = eos + self.session_len = session_len + self.user = user + self.assistant = assistant + + def decorate_prompt(self, prompt, sequence_start=True): + """Return the prompt that is concatenated with other elements in the + chat template. 
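For reference, a sketch of the first-turn prompt the SOLAR template above renders, assuming `BaseModel`'s usual `get_prompt(prompt, sequence_start)` helper and its default `capability='chat'`.

```python
# Sketch: what the SOLAR template registered above produces for the first turn.
from lmdeploy.model import MODELS

solar = MODELS.get('solar')()
print(solar.get_prompt('Write a hello-world in C', sequence_start=True))
# Expected result (system prompt is empty by default):
# '### System:\n\n\n### User:\nWrite a hello-world in C\n\n### Assistant:\n'
```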
+ + Args: + prompt (str): the input prompt + sequence_start (bool): indicator for the first round chat of a + session sequence + Returns: + str: the concatenated prompt + """ + assert self.capability == 'chat', \ + f'{type(self).__name__} has no capability of {self.capability}' + if sequence_start: + return f'{self.system}\n{self.user}{prompt}{self.eos}' \ + f'\n{self.assistant}' + + return f'\n{self.user}{prompt}{self.eos}' \ + f'\n{self.assistant}' + + def messages2prompt(self, messages, sequence_start=True): + """Return the prompt that is concatenated with other elements in the + chat template. Only evaluate the last instruction completion pair. + + Args: + messages (str | List): user's input prompt + Returns: + str: the concatenated prompt + """ + if isinstance(messages, str): + return self.get_prompt(messages, sequence_start) + system, users, assistants = self._translate_messages(messages) + system = self.system if not system else system + ret = f'{system}' + for user, assistant in zip(users, assistants): + if assistant: + ret += f'\n{self.user}{user}{self.eos}' \ + f'\n{self.assistant}{assistant}{self.eos}' + else: + ret += f'\n{self.user}{user}{self.eos}' \ + f'\n{self.assistant}' + return ret + + def main(model_name: str = 'test'): assert model_name in MODELS.module_dict.keys(), \ f"'{model_name}' is not supported. " \ @@ -637,4 +778,5 @@ def main(model_name: str = 'test'): if __name__ == '__main__': import fire + fire.Fire(main) diff --git a/lmdeploy/pytorch/chat.py b/lmdeploy/pytorch/chat.py index c30cf6ffe9..2690480a8c 100644 --- a/lmdeploy/pytorch/chat.py +++ b/lmdeploy/pytorch/chat.py @@ -51,7 +51,6 @@ import logging from typing import Optional -import fire import torch from transformers import GenerationConfig, PreTrainedModel @@ -205,6 +204,8 @@ def main( def cli(): + import fire + fire.Fire(main) diff --git a/lmdeploy/pytorch/modules/linear.py b/lmdeploy/pytorch/modules/linear.py index bfde0d3d42..218a36407e 100644 --- a/lmdeploy/pytorch/modules/linear.py +++ b/lmdeploy/pytorch/modules/linear.py @@ -4,6 +4,11 @@ import torch from torch import nn +try: + import awq_inference_engine +except ModuleNotFoundError: + awq_inference_engine = None + class WeightOnlyQLinear(nn.Module): """This class implements weight only quantization linear. @@ -18,13 +23,15 @@ class WeightOnlyQLinear(nn.Module): bias (Tensor, optional): Defaults to None. 
""" - def __init__(self, - w_bit: int, - symmetry: bool, - group_size: int, - in_features: int, - out_features: int, - bias: Optional[torch.Tensor] = None) -> None: + def __init__( + self, + in_features: int, + out_features: int, + bias: Optional[torch.Tensor] = True, + w_bit: int = 4, + symmetry: bool = False, + group_size: int = 128, + ) -> None: super().__init__() if w_bit not in [2, 4, 8]: @@ -92,8 +99,8 @@ def from_linear(cls: Type['WeightOnlyQLinear'], out_features = linear.out_features bias = False if linear.bias is None else True - qlinear = cls(w_bit, symmetry, group_size, in_features, out_features, - bias) + qlinear = cls(in_features, out_features, bias, w_bit, symmetry, + group_size) qlinear.bias = linear.bias qparams = quantizer.calculate_qparams(linear.weight) @@ -124,3 +131,24 @@ def from_linear(cls: Type['WeightOnlyQLinear'], qlinear.to('cpu') return qlinear + + @torch.no_grad() + def forward(self, x): + if awq_inference_engine is None: + raise RuntimeError( + 'Run the following command to install ' + 'the kernel for 4bit inference\n\n' + 'git clone https://github.com/mit-han-lab/llm-awq.git\n' + 'cd awq/kernels\n' + 'python setup.py install\n') + out_shape = x.shape[:-1] + (self.out_features, ) + inputs = x.reshape(-1, x.shape[-1]) + + out = awq_inference_engine.gemm_forward_cuda(inputs.half(), + self.qweight, + self.scales.half(), + self.qzeros, + self.group_size) + out = out + self.bias if self.bias is not None else out + + return out.reshape(out_shape) diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py index 9588b00da1..5abae0d97a 100644 --- a/lmdeploy/serve/async_engine.py +++ b/lmdeploy/serve/async_engine.py @@ -4,9 +4,7 @@ import os.path as osp import random from contextlib import contextmanager -from typing import Literal, Optional - -from lmdeploy.model import MODELS, BaseModel +from typing import List, Literal, Optional @dataclasses.dataclass @@ -28,7 +26,7 @@ class AsyncEngine: tp (int): tensor parallel """ - def __init__(self, model_path, instance_num=32, tp=1) -> None: + def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None: from lmdeploy import turbomind as tm from lmdeploy.tokenizer import Tokenizer tokenizer_model_path = osp.join(model_path, 'triton_models', @@ -36,18 +34,21 @@ def __init__(self, model_path, instance_num=32, tp=1) -> None: tokenizer = Tokenizer(tokenizer_model_path) self.tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, - tp=tp) + tp=tp, + **kwargs) self.tokenizer = tokenizer self.generators = [ self.tm_model.create_instance() for i in range(instance_num) ] self.instance_num = instance_num - self.model: BaseModel = MODELS.get(self.tm_model.model_name)() + self.model = self.tm_model.model self.available = [True] * instance_num self.starts = [None] * instance_num self.steps = {} + self.loop = asyncio.get_event_loop() def stop_session(self, session_id: int): + """Stop a session by a session_id.""" instance_id = session_id % self.instance_num input_ids = self.tokenizer.encode('') for outputs in self.generators[instance_id].stream_infer( @@ -60,8 +61,24 @@ def stop_session(self, session_id: int): pass self.available[instance_id] = True + def end_session(self, session_id: int): + """Clear a session by a session_id.""" + instance_id = session_id % self.instance_num + input_ids = self.tokenizer.encode('') + for outputs in self.generators[instance_id].stream_infer( + session_id, + input_ids, + request_output_len=0, + sequence_start=False, + sequence_end=True, + stop=True): + pass + 
self.steps[str(session_id)] = 0 + self.available[instance_id] = True + @contextmanager def safe_run(self, instance_id: int, session_id: Optional[int] = None): + """A context manager to make sure server's safe running.""" self.available[instance_id] = False try: yield @@ -82,22 +99,80 @@ async def get_generator(self, instance_id: int, stop: bool = False): await asyncio.sleep(0.1) return self.generators[instance_id] + def batch_infer(self, + prompts: List[str], + request_output_len=512, + top_k=40, + top_p=0.8, + temperature=0.8, + repetition_penalty=1.0, + ignore_eos=False, + do_preprocess=True, + **kwargs): + """Inference a batch of prompts. + + Args: + prompts (List[str]): a batch of prompts + request_output_len (int): output token nums + top_k (int): The number of the highest probability vocabulary + tokens to keep for top-k-filtering + top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or higher + are kept for generation. + temperature (float): to modulate the next token probability + repetition_penalty (float): The parameter for repetition penalty. + 1.0 means no penalty + ignore_eos (bool): indicator for ignoring eos + do_preprocess (bool): whether pre-process the messages. + """ + assert isinstance(prompts, List), 'prompts should be a list' + batch_size = len(prompts) + outputs = [''] * batch_size + generators = [] + for i, prompt in enumerate(prompts): + generators.append( + self.generate(prompt, + i, + stream_response=True, + sequence_start=True, + sequence_end=True, + request_output_len=request_output_len, + top_k=top_k, + top_p=top_p, + temperature=temperature, + ignore_eos=ignore_eos, + repetition_penalty=repetition_penalty, + do_preprocess=do_preprocess, + **kwargs)) + + async def _inner_call(i, generator): + async for out in generator: + outputs[i] += out.response + + async def gather(): + await asyncio.gather( + *[_inner_call(i, generators[i]) for i in range(batch_size)]) + + self.loop.run_until_complete(gather()) + return outputs + async def generate( - self, - messages, - session_id, - stream_response=True, - sequence_start=True, - sequence_end=False, - step=0, - request_output_len=512, - stop=False, - top_k=40, - top_p=0.8, - temperature=0.8, - repetition_penalty=1.0, - ignore_eos=False, - ): + self, + messages, + session_id, + stream_response=True, + sequence_start=True, + sequence_end=True, # no interactive mode by default + step=0, + request_output_len=512, + stop=False, + top_k=40, + top_p=0.8, + temperature=0.8, + repetition_penalty=1.0, + ignore_eos=False, + do_preprocess=True, + **kwargs): """Generate responses. Args: @@ -109,15 +184,16 @@ async def generate( sequence_end (bool): indicator for ending a sequence step (int): the offset of the k/v cache stop (bool): whether stop inference - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. top_k (int): The number of the highest probability vocabulary tokens to keep for top-k-filtering + top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or higher + are kept for generation. temperature (float): to modulate the next token probability repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty ignore_eos (bool): indicator for ignoring eos + do_preprocess (bool): whether pre-process the messages. 
""" instance_id = session_id % self.instance_num if str(session_id) not in self.steps: @@ -125,14 +201,18 @@ async def generate( if step != 0: self.steps[str(session_id)] = step seed = random.getrandbits(64) - prompt = self.model.messages2prompt(messages, sequence_start) + prompt = messages + if do_preprocess: + prompt = self.model.messages2prompt(prompt, sequence_start) input_ids = self.tokenizer.encode(prompt) finish_reason = 'stop' if stop else None if self.steps[str(session_id)] + len( - input_ids) >= self.tm_model.session_len: + input_ids) + request_output_len >= self.tm_model.session_len: finish_reason = 'length' yield GenOut('', self.steps[str(session_id)], len(input_ids), 0, finish_reason) + if sequence_end is True and sequence_start is False: + self.end_session(session_id) else: generator = await self.get_generator(instance_id, stop) with self.safe_run(instance_id, session_id): @@ -156,6 +236,11 @@ async def generate( # decode res response = self.tokenizer.decode(res.tolist(), offset=response_size) + # utf-8 char at the end means it's a potential unfinished + # byte sequence, continue to concate it with the next + # sequence and decode them together + if response.endswith('�'): + continue # response, history token len, # input token len, gen token len yield GenOut(response, self.steps[str(session_id)], @@ -166,93 +251,3 @@ async def generate( self.steps[str(session_id)] += len(input_ids) + tokens if sequence_end or stop: self.steps[str(session_id)] = 0 - - async def generate_openai( - self, - messages, - instance_id, - stream_response=True, - renew_session=False, - request_output_len=512, - stop=False, - top_k=40, - top_p=0.8, - temperature=0.8, - repetition_penalty=1.0, - ignore_eos=False, - ): - """Generate responses. - - Args: - messages (str | List): chat history or prompt - instance_id (int): actually request host ip - stream_response (bool): whether return responses streamingly - renew_session (bool): renew the session - request_output_len (int): output token nums - stop (bool): whether stop inference - top_p (float): If set to float < 1, only the smallest set of most - probable tokens with probabilities that add up to top_p or higher - are kept for generation. - top_k (int): The number of the highest probability vocabulary - tokens to keep for top-k-filtering - temperature (float): to modulate the next token probability - repetition_penalty (float): The parameter for repetition penalty. 
- 1.0 means no penalty - ignore_eos (bool): indicator for ignoring eos - """ - session_id = instance_id - instance_id %= self.instance_num - sequence_start = False - generator = await self.get_generator(instance_id) - if renew_session: # renew a session - empty_input_ids = self.tokenizer.encode('') - for outputs in generator.stream_infer(session_id=session_id, - input_ids=[empty_input_ids], - request_output_len=0, - sequence_start=False, - sequence_end=True, - stop=True): - pass - self.steps[str(session_id)] = 0 - if str(session_id) not in self.steps: - self.steps[str(session_id)] = 0 - if self.steps[str(session_id)] == 0: - sequence_start = True - seed = random.getrandbits(64) - prompt = self.model.messages2prompt(messages, sequence_start) - input_ids = self.tokenizer.encode(prompt) - finish_reason = 'stop' if stop else None - if self.steps[str(session_id)] + len( - input_ids) >= self.tm_model.session_len: - finish_reason = 'length' - yield GenOut('', self.steps[str(session_id)], len(input_ids), 0, - finish_reason) - else: - with self.safe_run(instance_id, session_id): - response_size = 0 - async for outputs in generator.async_stream_infer( - session_id=session_id, - input_ids=[input_ids], - stream_output=stream_response, - request_output_len=request_output_len, - sequence_start=(sequence_start), - sequence_end=False, - step=self.steps[str(session_id)], - stop=stop, - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - ignore_eos=ignore_eos, - random_seed=seed if sequence_start else None): - res, tokens = outputs[0] - # decode res - response = self.tokenizer.decode(res.tolist(), - offset=response_size) - # response, history len, input len, generation len - yield GenOut(response, self.steps[str(session_id)], - len(input_ids), tokens, finish_reason) - response_size = tokens - - # update step - self.steps[str(session_id)] += len(input_ids) + tokens diff --git a/lmdeploy/serve/client.py b/lmdeploy/serve/client.py index 283e96e299..424e83143f 100644 --- a/lmdeploy/serve/client.py +++ b/lmdeploy/serve/client.py @@ -1,8 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -import fire - from lmdeploy.serve.turbomind.chatbot import Chatbot @@ -20,7 +18,6 @@ def input_prompt(model_name): def main(tritonserver_addr: str, session_id: int = 1, cap: str = 'chat', - sys_instruct: str = None, stream_output: bool = True, **kwargs): """An example to communicate with inference server through the command line @@ -32,13 +29,11 @@ def main(tritonserver_addr: str, session_id (int): the identical id of a session cap (str): the capability of a model. For example, codellama has the ability among ['completion', 'infill', 'instruct', 'python'] - sys_instruct (str): the content of 'system' role, which is used by - conversational model stream_output (bool): indicator for streaming output or not **kwargs (dict): other arguments for initializing model's chat template """ log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING') - kwargs.update(capability=cap, system=sys_instruct) + kwargs.update(capability=cap) chatbot = Chatbot(tritonserver_addr, log_level=log_level, display=stream_output, @@ -69,4 +64,6 @@ def main(tritonserver_addr: str, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/gradio/__init__.py b/lmdeploy/serve/gradio/__init__.py index ef101fec61..770138a44d 100644 --- a/lmdeploy/serve/gradio/__init__.py +++ b/lmdeploy/serve/gradio/__init__.py @@ -1 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
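After this change, `lmdeploy serve triton_client {ip}:{port}` forwards straight to the entry point above; a rough programmatic equivalent (the address is a placeholder for a running Triton inference server):

```python
# Rough programmatic equivalent of `lmdeploy serve triton_client 0.0.0.0:33337`.
from lmdeploy.serve.client import main as triton_chat

triton_chat('0.0.0.0:33337', session_id=1, cap='chat', stream_output=True)
```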
+from .api_server_backend import run_api_server +from .triton_server_backend import run_triton_server +from .turbomind_coupled import run_local + +__all__ = ['run_api_server', 'run_triton_server', 'run_local'] diff --git a/lmdeploy/serve/gradio/api_server_backend.py b/lmdeploy/serve/gradio/api_server_backend.py new file mode 100644 index 0000000000..8dd92fa0fd --- /dev/null +++ b/lmdeploy/serve/gradio/api_server_backend.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +from threading import Lock +from typing import Sequence + +import gradio as gr + +from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn +from lmdeploy.serve.openai.api_client import (get_model_list, + get_streaming_response) + + +class InterFace: + api_server_url: str = None + global_session_id: int = 0 + lock = Lock() + + +def chat_stream_restful(instruction: str, state_chatbot: Sequence, + cancel_btn: gr.Button, reset_btn: gr.Button, + session_id: int): + """Chat with AI assistant. + + Args: + instruction (str): user's prompt + state_chatbot (Sequence): the chatting history + session_id (int): the session id + """ + state_chatbot = state_chatbot + [(instruction, None)] + + yield (state_chatbot, state_chatbot, disable_btn, enable_btn) + + for response, tokens, finish_reason in get_streaming_response( + instruction, + f'{InterFace.api_server_url}/v1/chat/interactive', + session_id=session_id, + request_output_len=512, + interactive_mode=True): + if finish_reason == 'length': + gr.Warning('WARNING: exceed session max length.' + ' Please restart the session by reset button.') + if tokens < 0: + gr.Warning('WARNING: running on the old session.' + ' Please restart the session by reset button.') + if state_chatbot[-1][-1] is None: + state_chatbot[-1] = (state_chatbot[-1][0], response) + else: + state_chatbot[-1] = (state_chatbot[-1][0], + state_chatbot[-1][1] + response + ) # piece by piece + yield (state_chatbot, state_chatbot, enable_btn, disable_btn) + + yield (state_chatbot, state_chatbot, disable_btn, enable_btn) + + +def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, + session_id: int): + """reset the session. + + Args: + instruction_txtbox (str): user's prompt + state_chatbot (Sequence): the chatting history + session_id (int): the session id + """ + state_chatbot = [] + # end the session + for response, tokens, finish_reason in get_streaming_response( + '', + f'{InterFace.api_server_url}/v1/chat/interactive', + session_id=session_id, + request_output_len=0, + interactive_mode=False): + pass + + return ( + state_chatbot, + state_chatbot, + gr.Textbox.update(value=''), + ) + + +def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button, + reset_btn: gr.Button, session_id: int): + """stop the session. 
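The gradio backend above drives the model through `get_streaming_response` against `/v1/chat/interactive`; the same helper can be used directly, assuming an `api_server` instance running at the placeholder URL below.

```python
# Direct use of the streaming helper the gradio backend relies on above;
# the URL is a placeholder for a running `lmdeploy serve api_server` instance.
from lmdeploy.serve.openai.api_client import get_streaming_response

for response, tokens, finish_reason in get_streaming_response(
        'Hello, who are you?',
        'http://localhost:23333/v1/chat/interactive',
        session_id=0,
        request_output_len=512,
        interactive_mode=True):
    print(response, end='', flush=True)
```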
+ + Args: + instruction_txtbox (str): user's prompt + state_chatbot (Sequence): the chatting history + session_id (int): the session id + """ + yield (state_chatbot, disable_btn, disable_btn) + # end the session + for out in get_streaming_response( + '', + f'{InterFace.api_server_url}/v1/chat/interactive', + session_id=session_id, + request_output_len=0, + stop=True): + pass + time.sleep(0.5) + messages = [] + for qa in state_chatbot: + messages.append(dict(role='user', content=qa[0])) + if qa[1] is not None: + messages.append(dict(role='assistant', content=qa[1])) + for out in get_streaming_response( + messages, + f'{InterFace.api_server_url}/v1/chat/interactive', + session_id=session_id, + request_output_len=0, + interactive_mode=True): + pass + yield (state_chatbot, disable_btn, enable_btn) + + +def run_api_server(api_server_url: str, + server_name: str = 'localhost', + server_port: int = 6006, + batch_size: int = 32): + """chat with AI assistant through web ui. + + Args: + api_server_url (str): restufl api url + server_name (str): the ip address of gradio server + server_port (int): the port of gradio server + batch_size (int): batch size for running Turbomind directly + """ + InterFace.api_server_url = api_server_url + model_names = get_model_list(f'{api_server_url}/v1/models') + model_name = '' + if isinstance(model_names, list) and len(model_names) > 0: + model_name = model_names[0] + else: + raise ValueError('gradio can find a suitable model from restful-api') + + with gr.Blocks(css=CSS, theme=THEME) as demo: + state_chatbot = gr.State([]) + state_session_id = gr.State(0) + + with gr.Column(elem_id='container'): + gr.Markdown('## LMDeploy Playground') + + chatbot = gr.Chatbot(elem_id='chatbot', label=model_name) + instruction_txtbox = gr.Textbox( + placeholder='Please input the instruction', + label='Instruction') + with gr.Row(): + cancel_btn = gr.Button(value='Cancel', interactive=False) + reset_btn = gr.Button(value='Reset') + + send_event = instruction_txtbox.submit(chat_stream_restful, [ + instruction_txtbox, state_chatbot, cancel_btn, reset_btn, + state_session_id + ], [state_chatbot, chatbot, cancel_btn, reset_btn]) + instruction_txtbox.submit( + lambda: gr.Textbox.update(value=''), + [], + [instruction_txtbox], + ) + cancel_btn.click( + cancel_restful_func, + [state_chatbot, cancel_btn, reset_btn, state_session_id], + [state_chatbot, cancel_btn, reset_btn], + cancels=[send_event]) + + reset_btn.click(reset_restful_func, + [instruction_txtbox, state_chatbot, state_session_id], + [state_chatbot, chatbot, instruction_txtbox], + cancels=[send_event]) + + def init(): + with InterFace.lock: + InterFace.global_session_id += 1 + new_session_id = InterFace.global_session_id + return new_session_id + + demo.load(init, inputs=None, outputs=[state_session_id]) + + print(f'server is gonna mount on: http://{server_name}:{server_port}') + demo.queue(concurrency_count=batch_size, max_size=100, + api_open=True).launch( + max_threads=10, + share=True, + server_port=server_port, + server_name=server_name, + ) diff --git a/lmdeploy/serve/gradio/app.py b/lmdeploy/serve/gradio/app.py index 71db7a2749..5b1668224d 100644 --- a/lmdeploy/serve/gradio/app.py +++ b/lmdeploy/serve/gradio/app.py @@ -1,542 +1,41 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import os -import threading -import time -from functools import partial -from typing import Sequence - -import fire -import gradio as gr - -from lmdeploy.serve.async_engine import AsyncEngine -from lmdeploy.serve.gradio.css import CSS -from lmdeploy.serve.openai.api_client import (get_model_list, - get_streaming_response) -from lmdeploy.serve.openai.api_server import ip2id -from lmdeploy.serve.turbomind.chatbot import Chatbot - -THEME = gr.themes.Soft( - primary_hue=gr.themes.colors.blue, - secondary_hue=gr.themes.colors.sky, - font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif']) - -enable_btn = gr.Button.update(interactive=True) -disable_btn = gr.Button.update(interactive=False) - - -def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot, - request: gr.Request): - """Chat with AI assistant. - - Args: - instruction (str): user's prompt - state_chatbot (Sequence): the chatting history - llama_chatbot (Chatbot): the instance of a chatbot - request (gr.Request): the request from a user - model_name (str): the name of deployed model - """ - instruction = state_chatbot[-1][0] - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - - bot_response = llama_chatbot.stream_infer( - session_id, instruction, f'{session_id}-{len(state_chatbot)}') - - for status, tokens, _ in bot_response: - state_chatbot[-1] = (state_chatbot[-1][0], tokens) - yield (state_chatbot, state_chatbot, '') - - return (state_chatbot, state_chatbot, '') - - -def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, - llama_chatbot: gr.State, triton_server_addr: str, - model_name: str): - """reset the session.""" - state_chatbot = [] - log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO') - llama_chatbot = Chatbot(triton_server_addr, - model_name, - log_level=log_level, - display=True) - - return ( - llama_chatbot, - state_chatbot, - state_chatbot, - gr.Textbox.update(value=''), - ) - - -def cancel_func( - instruction_txtbox: gr.Textbox, - state_chatbot: gr.State, - llama_chatbot: gr.State, -): - """cancel the session.""" - session_id = llama_chatbot._session.session_id - llama_chatbot.cancel(session_id) - - return ( - llama_chatbot, - state_chatbot, - ) - - -def add_instruction(instruction, state_chatbot): - state_chatbot = state_chatbot + [(instruction, None)] - return ('', state_chatbot) - - -def run_server(triton_server_addr: str, - server_name: str = 'localhost', - server_port: int = 6006): - """chat with AI assistant through web ui. 
- - Args: - triton_server_addr (str): the communication address of inference server - server_name (str): the ip address of gradio server - server_port (int): the port of gradio server - """ - with gr.Blocks(css=CSS, theme=THEME) as demo: - log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO') - llama_chatbot = gr.State( - Chatbot(triton_server_addr, log_level=log_level, display=True)) - state_chatbot = gr.State([]) - model_name = llama_chatbot.value.model_name - reset_all = partial(reset_all_func, - model_name=model_name, - triton_server_addr=triton_server_addr) - - with gr.Column(elem_id='container'): - gr.Markdown('## LMDeploy Playground') - - chatbot = gr.Chatbot(elem_id='chatbot', label=model_name) - instruction_txtbox = gr.Textbox( - placeholder='Please input the instruction', - label='Instruction') - with gr.Row(): - cancel_btn = gr.Button(value='Cancel') - reset_btn = gr.Button(value='Reset') - - send_event = instruction_txtbox.submit( - add_instruction, [instruction_txtbox, state_chatbot], - [instruction_txtbox, state_chatbot]).then( - chat_stream, [state_chatbot, llama_chatbot], - [state_chatbot, chatbot]) - - cancel_btn.click(cancel_func, - [instruction_txtbox, state_chatbot, llama_chatbot], - [llama_chatbot, chatbot], - cancels=[send_event]) - - reset_btn.click( - reset_all, [instruction_txtbox, state_chatbot, llama_chatbot], - [llama_chatbot, state_chatbot, chatbot, instruction_txtbox], - cancels=[send_event]) - - print(f'server is gonna mount on: http://{server_name}:{server_port}') - demo.queue(concurrency_count=4, max_size=100, api_open=True).launch( - max_threads=10, - share=True, - server_port=server_port, - server_name=server_name, - ) - - -# a IO interface mananing variables -class InterFace: - async_engine: AsyncEngine = None # for run_local - restful_api_url: str = None # for run_restful - - -def chat_stream_restful( - instruction: str, - state_chatbot: Sequence, - cancel_btn: gr.Button, - reset_btn: gr.Button, - request: gr.Request, -): - """Chat with AI assistant. - - Args: - instruction (str): user's prompt - state_chatbot (Sequence): the chatting history - request (gr.Request): the request from a user - """ - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - bot_summarized_response = '' - state_chatbot = state_chatbot + [(instruction, None)] - - yield (state_chatbot, state_chatbot, disable_btn, enable_btn, - f'{bot_summarized_response}'.strip()) - - for response, tokens, finish_reason in get_streaming_response( - instruction, - f'{InterFace.restful_api_url}/generate', - session_id=session_id, - request_output_len=512, - sequence_start=(len(state_chatbot) == 1), - sequence_end=False): - if finish_reason == 'length': - gr.Warning('WARNING: exceed session max length.' - ' Please restart the session by reset button.') - if tokens < 0: - gr.Warning('WARNING: running on the old session.' - ' Please restart the session by reset button.') - if state_chatbot[-1][-1] is None: - state_chatbot[-1] = (state_chatbot[-1][0], response) - else: - state_chatbot[-1] = (state_chatbot[-1][0], - state_chatbot[-1][1] + response - ) # piece by piece - yield (state_chatbot, state_chatbot, enable_btn, disable_btn, - f'{bot_summarized_response}'.strip()) - - yield (state_chatbot, state_chatbot, disable_btn, enable_btn, - f'{bot_summarized_response}'.strip()) - - -def reset_restful_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, - request: gr.Request): - """reset the session. 
- - Args: - instruction_txtbox (str): user's prompt - state_chatbot (Sequence): the chatting history - request (gr.Request): the request from a user - """ - state_chatbot = [] - - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - # end the session - for response, tokens, finish_reason in get_streaming_response( - '', - f'{InterFace.restful_api_url}/generate', - session_id=session_id, - request_output_len=0, - sequence_start=False, - sequence_end=True): - pass - - return ( - state_chatbot, - state_chatbot, - gr.Textbox.update(value=''), - ) - - -def cancel_restful_func(state_chatbot: gr.State, cancel_btn: gr.Button, - reset_btn: gr.Button, request: gr.Request): - """stop the session. - - Args: - instruction_txtbox (str): user's prompt - state_chatbot (Sequence): the chatting history - request (gr.Request): the request from a user - """ - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - # end the session - for out in get_streaming_response('', - f'{InterFace.restful_api_url}/generate', - session_id=session_id, - request_output_len=0, - sequence_start=False, - sequence_end=False, - stop=True): - pass - time.sleep(0.5) - messages = [] - for qa in state_chatbot: - messages.append(dict(role='user', content=qa[0])) - if qa[1] is not None: - messages.append(dict(role='assistant', content=qa[1])) - for out in get_streaming_response(messages, - f'{InterFace.restful_api_url}/generate', - session_id=session_id, - request_output_len=0, - sequence_start=True, - sequence_end=False): - pass - return (state_chatbot, disable_btn, enable_btn) - - -def run_restful(restful_api_url: str, - server_name: str = 'localhost', - server_port: int = 6006, - batch_size: int = 32): - """chat with AI assistant through web ui. 
- - Args: - restful_api_url (str): restufl api url - server_name (str): the ip address of gradio server - server_port (int): the port of gradio server - batch_size (int): batch size for running Turbomind directly - """ - InterFace.restful_api_url = restful_api_url - model_names = get_model_list(f'{restful_api_url}/v1/models') - model_name = '' - if isinstance(model_names, list) and len(model_names) > 0: - model_name = model_names[0] - else: - raise ValueError('gradio can find a suitable model from restful-api') - - with gr.Blocks(css=CSS, theme=THEME) as demo: - state_chatbot = gr.State([]) - - with gr.Column(elem_id='container'): - gr.Markdown('## LMDeploy Playground') - - chatbot = gr.Chatbot(elem_id='chatbot', label=model_name) - instruction_txtbox = gr.Textbox( - placeholder='Please input the instruction', - label='Instruction') - with gr.Row(): - cancel_btn = gr.Button(value='Cancel', interactive=False) - reset_btn = gr.Button(value='Reset') - - send_event = instruction_txtbox.submit( - chat_stream_restful, - [instruction_txtbox, state_chatbot, cancel_btn, reset_btn], - [state_chatbot, chatbot, cancel_btn, reset_btn]) - instruction_txtbox.submit( - lambda: gr.Textbox.update(value=''), - [], - [instruction_txtbox], - ) - cancel_btn.click(cancel_restful_func, - [state_chatbot, cancel_btn, reset_btn], - [state_chatbot, cancel_btn, reset_btn], - cancels=[send_event]) - - reset_btn.click(reset_restful_func, - [instruction_txtbox, state_chatbot], - [state_chatbot, chatbot, instruction_txtbox], - cancels=[send_event]) - - print(f'server is gonna mount on: http://{server_name}:{server_port}') - demo.queue(concurrency_count=batch_size, max_size=100, - api_open=True).launch( - max_threads=10, - share=True, - server_port=server_port, - server_name=server_name, - ) - - -async def chat_stream_local( - instruction: str, - state_chatbot: Sequence, - cancel_btn: gr.Button, - reset_btn: gr.Button, - request: gr.Request, -): - """Chat with AI assistant. - - Args: - instruction (str): user's prompt - state_chatbot (Sequence): the chatting history - request (gr.Request): the request from a user - """ - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - bot_summarized_response = '' - state_chatbot = state_chatbot + [(instruction, None)] - - yield (state_chatbot, state_chatbot, disable_btn, enable_btn, - f'{bot_summarized_response}'.strip()) - - async for outputs in InterFace.async_engine.generate( - instruction, - session_id, - stream_response=True, - sequence_start=(len(state_chatbot) == 1)): - response = outputs.response - if outputs.finish_reason == 'length': - gr.Warning('WARNING: exceed session max length.' - ' Please restart the session by reset button.') - if outputs.generate_token_len < 0: - gr.Warning('WARNING: running on the old session.' - ' Please restart the session by reset button.') - if state_chatbot[-1][-1] is None: - state_chatbot[-1] = (state_chatbot[-1][0], response) - else: - state_chatbot[-1] = (state_chatbot[-1][0], - state_chatbot[-1][1] + response - ) # piece by piece - yield (state_chatbot, state_chatbot, enable_btn, disable_btn, - f'{bot_summarized_response}'.strip()) - - yield (state_chatbot, state_chatbot, disable_btn, enable_btn, - f'{bot_summarized_response}'.strip()) - - -async def reset_local_func(instruction_txtbox: gr.Textbox, - state_chatbot: gr.State, request: gr.Request): - """reset the session. 
- - Args: - instruction_txtbox (str): user's prompt - state_chatbot (Sequence): the chatting history - request (gr.Request): the request from a user - """ - state_chatbot = [] - - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - # end the session - async for out in InterFace.async_engine.generate('', - session_id, - request_output_len=1, - stream_response=True, - sequence_start=False, - sequence_end=True): - pass - - return ( - state_chatbot, - state_chatbot, - gr.Textbox.update(value=''), - ) - - -async def cancel_local_func(state_chatbot: gr.State, cancel_btn: gr.Button, - reset_btn: gr.Button, request: gr.Request): - """stop the session. - - Args: - instruction_txtbox (str): user's prompt - state_chatbot (Sequence): the chatting history - request (gr.Request): the request from a user - """ - session_id = threading.current_thread().ident - if request is not None: - session_id = ip2id(request.kwargs['client']['host']) - # end the session - async for out in InterFace.async_engine.generate('', - session_id, - request_output_len=0, - stream_response=True, - sequence_start=False, - sequence_end=False, - stop=True): - pass - messages = [] - for qa in state_chatbot: - messages.append(dict(role='user', content=qa[0])) - if qa[1] is not None: - messages.append(dict(role='assistant', content=qa[1])) - async for out in InterFace.async_engine.generate(messages, - session_id, - request_output_len=0, - stream_response=True, - sequence_start=True, - sequence_end=False): - pass - return (state_chatbot, disable_btn, enable_btn) - - -def run_local(model_path: str, - server_name: str = 'localhost', - server_port: int = 6006, - batch_size: int = 4, - tp: int = 1): - """chat with AI assistant through web ui. 
- - Args: - model_path (str): the path of the deployed model - server_name (str): the ip address of gradio server - server_port (int): the port of gradio server - batch_size (int): batch size for running Turbomind directly - tp (int): tensor parallel for Turbomind - """ - InterFace.async_engine = AsyncEngine(model_path=model_path, - instance_num=batch_size, - tp=tp) - - with gr.Blocks(css=CSS, theme=THEME) as demo: - state_chatbot = gr.State([]) - - with gr.Column(elem_id='container'): - gr.Markdown('## LMDeploy Playground') - - chatbot = gr.Chatbot( - elem_id='chatbot', - label=InterFace.async_engine.tm_model.model_name) - instruction_txtbox = gr.Textbox( - placeholder='Please input the instruction', - label='Instruction') - with gr.Row(): - cancel_btn = gr.Button(value='Cancel', interactive=False) - reset_btn = gr.Button(value='Reset') - - send_event = instruction_txtbox.submit( - chat_stream_local, - [instruction_txtbox, state_chatbot, cancel_btn, reset_btn], - [state_chatbot, chatbot, cancel_btn, reset_btn]) - instruction_txtbox.submit( - lambda: gr.Textbox.update(value=''), - [], - [instruction_txtbox], - ) - cancel_btn.click(cancel_local_func, - [state_chatbot, cancel_btn, reset_btn], - [state_chatbot, cancel_btn, reset_btn], - cancels=[send_event]) - - reset_btn.click(reset_local_func, [instruction_txtbox, state_chatbot], - [state_chatbot, chatbot, instruction_txtbox], - cancels=[send_event]) - - print(f'server is gonna mount on: http://{server_name}:{server_port}') - demo.queue(concurrency_count=batch_size, max_size=100, - api_open=True).launch( - max_threads=10, - share=True, - server_port=server_port, - server_name=server_name, - ) def run(model_path_or_server: str, - server_name: str = 'localhost', + server_name: str = '0.0.0.0', server_port: int = 6006, batch_size: int = 32, tp: int = 1, - restful_api: bool = False): + **kwargs): """chat with AI assistant through web ui. Args: model_path_or_server (str): the path of the deployed model or the - tritonserver URL or restful api URL. The former is for directly - running service with gradio. The latter is for running with - tritonserver by default. If the input URL is restful api. Please - enable another flag `restful_api`. + tritonserver URL or restful api URL. For example: + - ./workspace + - 0.0.0.0:23333 + - http://0.0.0.0:23333 server_name (str): the ip address of gradio server server_port (int): the port of gradio server batch_size (int): batch size for running Turbomind directly tp (int): tensor parallel for Turbomind - restufl_api (bool): a flag for model_path_or_server """ if ':' in model_path_or_server: - if restful_api: - run_restful(model_path_or_server, server_name, server_port, - batch_size) + if 'http:' in model_path_or_server: + from lmdeploy.serve.gradio.api_server_backend import run_api_server + run_api_server(model_path_or_server, server_name, server_port, + batch_size) else: - run_server(model_path_or_server, server_name, server_port) + from lmdeploy.serve.gradio.triton_server_backend import \ + run_triton_server + run_triton_server(model_path_or_server, server_name, server_port) else: + from lmdeploy.serve.gradio.turbomind_coupled import run_local run_local(model_path_or_server, server_name, server_port, batch_size, tp) if __name__ == '__main__': + import fire + fire.Fire(run) diff --git a/lmdeploy/serve/gradio/constants.py b/lmdeploy/serve/gradio/constants.py new file mode 100644 index 0000000000..891c572e5a --- /dev/null +++ b/lmdeploy/serve/gradio/constants.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. + +import gradio as gr + +CSS = """ +#container { + width: 95%; + margin-left: auto; + margin-right: auto; +} + +#chatbot { + height: 500px; + overflow: auto; +} + +.chat_wrap_space { + margin-left: 0.5em +} +""" + +THEME = gr.themes.Soft( + primary_hue=gr.themes.colors.blue, + secondary_hue=gr.themes.colors.sky, + font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif']) + +enable_btn = gr.Button.update(interactive=True) +disable_btn = gr.Button.update(interactive=False) diff --git a/lmdeploy/serve/gradio/css.py b/lmdeploy/serve/gradio/css.py deleted file mode 100644 index b3bd233222..0000000000 --- a/lmdeploy/serve/gradio/css.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -CSS = """ -#container { - width: 95%; - margin-left: auto; - margin-right: auto; -} - -#chatbot { - height: 500px; - overflow: auto; -} - -.chat_wrap_space { - margin-left: 0.5em -} -""" diff --git a/lmdeploy/serve/gradio/triton_server_backend.py b/lmdeploy/serve/gradio/triton_server_backend.py new file mode 100644 index 0000000000..9148903cc5 --- /dev/null +++ b/lmdeploy/serve/gradio/triton_server_backend.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from functools import partial +from threading import Lock +from typing import Sequence + +import gradio as gr + +from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn +from lmdeploy.serve.turbomind.chatbot import Chatbot + + +class InterFace: + global_session_id: int = 0 + lock = Lock() + + +def chat_stream(state_chatbot: Sequence, llama_chatbot: Chatbot, + cancel_btn: gr.Button, reset_btn: gr.Button, session_id: int): + """Chat with AI assistant. + + Args: + instruction (str): user's prompt + state_chatbot (Sequence): the chatting history + llama_chatbot (Chatbot): the instance of a chatbot + cancel_btn (bool): enable the cancel button or not + reset_btn (bool): enable the reset button or not + session_id (int): the session id + """ + instruction = state_chatbot[-1][0] + + bot_response = llama_chatbot.stream_infer( + session_id, instruction, f'{session_id}-{len(state_chatbot)}') + + for status, tokens, _ in bot_response: + state_chatbot[-1] = (state_chatbot[-1][0], tokens) + yield (state_chatbot, state_chatbot, enable_btn, disable_btn) + + yield (state_chatbot, state_chatbot, disable_btn, enable_btn) + + +def reset_all_func(instruction_txtbox: gr.Textbox, state_chatbot: gr.State, + llama_chatbot: gr.State, triton_server_addr: str, + model_name: str): + """reset the session.""" + state_chatbot = [] + log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO') + llama_chatbot = Chatbot(triton_server_addr, + model_name, + log_level=log_level, + display=True) + + return ( + llama_chatbot, + state_chatbot, + state_chatbot, + gr.Textbox.update(value=''), + ) + + +def cancel_func( + state_chatbot: gr.State, + llama_chatbot: gr.State, + cancel_btn: gr.Button, + reset_btn: gr.Button, +): + """cancel the session.""" + yield (llama_chatbot, state_chatbot, disable_btn, disable_btn) + session_id = llama_chatbot._session.session_id + llama_chatbot.cancel(session_id) + + yield (llama_chatbot, state_chatbot, disable_btn, enable_btn) + + +def add_instruction(instruction, state_chatbot): + state_chatbot = state_chatbot + [(instruction, None)] + return ('', state_chatbot) + + +def run_triton_server(triton_server_addr: str, + server_name: str = 'localhost', + server_port: int = 6006): + """chat with AI assistant through web ui. 
+ + Args: + triton_server_addr (str): the communication address of inference server + server_name (str): the ip address of gradio server + server_port (int): the port of gradio server + """ + with gr.Blocks(css=CSS, theme=THEME) as demo: + log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO') + llama_chatbot = gr.State( + Chatbot(triton_server_addr, log_level=log_level, display=True)) + state_chatbot = gr.State([]) + state_session_id = gr.State(0) + model_name = llama_chatbot.value.model_name + reset_all = partial(reset_all_func, + model_name=model_name, + triton_server_addr=triton_server_addr) + + with gr.Column(elem_id='container'): + gr.Markdown('## LMDeploy Playground') + + chatbot = gr.Chatbot(elem_id='chatbot', label=model_name) + instruction_txtbox = gr.Textbox( + placeholder='Please input the instruction', + label='Instruction') + with gr.Row(): + cancel_btn = gr.Button(value='Cancel', interactive=False) + reset_btn = gr.Button(value='Reset') + + send_event = instruction_txtbox.submit( + add_instruction, [instruction_txtbox, state_chatbot], + [instruction_txtbox, state_chatbot]).then(chat_stream, [ + state_chatbot, llama_chatbot, cancel_btn, reset_btn, + state_session_id + ], [state_chatbot, chatbot, cancel_btn, reset_btn]) + + cancel_btn.click(cancel_func, + [state_chatbot, llama_chatbot, cancel_btn, reset_btn], + [llama_chatbot, chatbot, cancel_btn, reset_btn], + cancels=[send_event]) + + reset_btn.click( + reset_all, [instruction_txtbox, state_chatbot, llama_chatbot], + [llama_chatbot, state_chatbot, chatbot, instruction_txtbox], + cancels=[send_event]) + + def init(): + with InterFace.lock: + InterFace.global_session_id += 1 + new_session_id = InterFace.global_session_id + return new_session_id + + demo.load(init, inputs=None, outputs=[state_session_id]) + + print(f'server is gonna mount on: http://{server_name}:{server_port}') + demo.queue(concurrency_count=4, max_size=100, api_open=True).launch( + max_threads=10, + share=True, + server_port=server_port, + server_name=server_name, + ) diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py new file mode 100644 index 0000000000..e344abcbda --- /dev/null +++ b/lmdeploy/serve/gradio/turbomind_coupled.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from threading import Lock +from typing import Sequence + +import gradio as gr + +from lmdeploy.serve.async_engine import AsyncEngine +from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn + + +class InterFace: + async_engine: AsyncEngine = None + global_session_id: int = 0 + lock = Lock() + + +async def chat_stream_local( + instruction: str, + state_chatbot: Sequence, + cancel_btn: gr.Button, + reset_btn: gr.Button, + session_id: int, +): + """Chat with AI assistant. + + Args: + instruction (str): user's prompt + state_chatbot (Sequence): the chatting history + cancel_btn (gr.Button): the cancel button + reset_btn (gr.Button): the reset button + session_id (int): the session id + """ + state_chatbot = state_chatbot + [(instruction, None)] + + yield (state_chatbot, state_chatbot, disable_btn, enable_btn) + + async for outputs in InterFace.async_engine.generate( + instruction, + session_id, + stream_response=True, + sequence_start=(len(state_chatbot) == 1), + sequence_end=False): + response = outputs.response + if outputs.finish_reason == 'length': + gr.Warning('WARNING: exceed session max length.' 
+ ' Please restart the session by reset button.') + if outputs.generate_token_len < 0: + gr.Warning('WARNING: running on the old session.' + ' Please restart the session by reset button.') + if state_chatbot[-1][-1] is None: + state_chatbot[-1] = (state_chatbot[-1][0], response) + else: + state_chatbot[-1] = (state_chatbot[-1][0], + state_chatbot[-1][1] + response + ) # piece by piece + yield (state_chatbot, state_chatbot, enable_btn, disable_btn) + + yield (state_chatbot, state_chatbot, disable_btn, enable_btn) + + +async def reset_local_func(instruction_txtbox: gr.Textbox, + state_chatbot: Sequence, session_id: int): + """reset the session. + + Args: + instruction_txtbox (str): user's prompt + state_chatbot (Sequence): the chatting history + session_id (int): the session id + """ + state_chatbot = [] + # end the session + async for out in InterFace.async_engine.generate('', + session_id, + request_output_len=1, + stream_response=True, + sequence_start=False, + sequence_end=True): + pass + return (state_chatbot, state_chatbot, gr.Textbox.update(value='')) + + +async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button, + reset_btn: gr.Button, session_id: int): + """stop the session. + + Args: + instruction_txtbox (str): user's prompt + state_chatbot (Sequence): the chatting history + cancel_btn (gr.Button): the cancel button + reset_btn (gr.Button): the reset button + session_id (int): the session id + """ + yield (state_chatbot, disable_btn, enable_btn) + async for out in InterFace.async_engine.generate('', + session_id, + request_output_len=0, + stream_response=True, + sequence_start=False, + sequence_end=False, + stop=True): + pass + messages = [] + for qa in state_chatbot: + messages.append(dict(role='user', content=qa[0])) + if qa[1] is not None: + messages.append(dict(role='assistant', content=qa[1])) + async for out in InterFace.async_engine.generate(messages, + session_id, + request_output_len=0, + stream_response=True, + sequence_start=True, + sequence_end=False): + pass + yield (state_chatbot, disable_btn, enable_btn) + + +def run_local(model_path: str, + server_name: str = 'localhost', + server_port: int = 6006, + batch_size: int = 4, + tp: int = 1): + """chat with AI assistant through web ui. 
+ + Args: + model_path (str): the path of the deployed model + server_name (str): the ip address of gradio server + server_port (int): the port of gradio server + batch_size (int): batch size for running Turbomind directly + tp (int): tensor parallel for Turbomind + """ + InterFace.async_engine = AsyncEngine(model_path=model_path, + instance_num=batch_size, + tp=tp) + + with gr.Blocks(css=CSS, theme=THEME) as demo: + state_chatbot = gr.State([]) + state_session_id = gr.State(0) + + with gr.Column(elem_id='container'): + gr.Markdown('## LMDeploy Playground') + + chatbot = gr.Chatbot( + elem_id='chatbot', + label=InterFace.async_engine.tm_model.model_name) + instruction_txtbox = gr.Textbox( + placeholder='Please input the instruction', + label='Instruction') + with gr.Row(): + cancel_btn = gr.Button(value='Cancel', interactive=False) + reset_btn = gr.Button(value='Reset') + + send_event = instruction_txtbox.submit(chat_stream_local, [ + instruction_txtbox, state_chatbot, cancel_btn, reset_btn, + state_session_id + ], [state_chatbot, chatbot, cancel_btn, reset_btn]) + instruction_txtbox.submit( + lambda: gr.Textbox.update(value=''), + [], + [instruction_txtbox], + ) + cancel_btn.click( + cancel_local_func, + [state_chatbot, cancel_btn, reset_btn, state_session_id], + [state_chatbot, cancel_btn, reset_btn], + cancels=[send_event]) + + reset_btn.click(reset_local_func, + [instruction_txtbox, state_chatbot, state_session_id], + [state_chatbot, chatbot, instruction_txtbox], + cancels=[send_event]) + + def init(): + with InterFace.lock: + InterFace.global_session_id += 1 + new_session_id = InterFace.global_session_id + return new_session_id + + demo.load(init, inputs=None, outputs=[state_session_id]) + + print(f'server is gonna mount on: http://{server_name}:{server_port}') + demo.queue(concurrency_count=batch_size, max_size=100, + api_open=True).launch( + max_threads=10, + share=True, + server_port=server_port, + server_name=server_name, + ) diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py index a8718331be..a1610e05ea 100644 --- a/lmdeploy/serve/openai/api_client.py +++ b/lmdeploy/serve/openai/api_client.py @@ -1,8 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import json -from typing import Iterable, List +from typing import Any, Dict, Iterable, List, Optional, Union -import fire import requests @@ -15,13 +14,306 @@ def get_model_list(api_url: str): return None +class APIClient: + """Chatbot for LLaMA series models with turbomind as inference engine. 
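The run_local entry point above can also be reached through the run() dispatcher in lmdeploy/serve/gradio/app.py shown earlier. A sketch under the assumption that a converted model sits in ./workspace; the call blocks while it serves the UI:

from lmdeploy.serve.gradio.app import run

# './workspace' contains no colon, so run() hands off to turbomind_coupled.run_local;
# an http:// URL would select api_server_backend, a bare ip:port the triton backend.
run('./workspace', server_name='0.0.0.0', server_port=6006, batch_size=4, tp=1)
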
+ + Args: + api_server_url (str): communicating address 'http://:' of + api_server + """ + + def __init__(self, api_server_url: str, **kwargs): + self.api_server_url = api_server_url + self.chat_intractive_v1_url = f'{api_server_url}/v1/chat/interactive' + self.chat_completions_v1_url = f'{api_server_url}/v1/chat/completions' + self.completions_v1_url = f'{api_server_url}/v1/completions' + self.models_v1_url = f'{api_server_url}/v1/models' + self._available_models = None + + @property + def available_models(self): + """Show available models.""" + if self._available_models is not None: + return self._available_models + response = requests.get(self.models_v1_url) + if hasattr(response, 'text'): + model_list = json.loads(response.text) + model_list = model_list.pop('data', []) + self._available_models = [item['id'] for item in model_list] + return self._available_models + return None + + def chat_completions_v1(self, + model: str, + messages: Union[str, List[Dict[str, str]]], + temperature: Optional[float] = 0.7, + top_p: Optional[float] = 1.0, + n: Optional[int] = 1, + max_tokens: Optional[int] = 512, + stop: Optional[bool] = False, + stream: Optional[bool] = False, + presence_penalty: Optional[float] = 0.0, + frequency_penalty: Optional[float] = 0.0, + user: Optional[str] = None, + repetition_penalty: Optional[float] = 1.0, + session_id: Optional[int] = -1, + ignore_eos: Optional[bool] = False, + **kwargs): + """Chat completion v1. + + Args: + model: model name. Available from self.available_models. + messages: string prompt or chat history in OpenAI format. + temperature (float): to modulate the next token probability + top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or + higher are kept for generation. + n (int): How many chat completion choices to generate for each + input message. Only support one here. + stream: whether to stream the results or not. Default to false. + max_tokens (int): output token nums + repetition_penalty (float): The parameter for repetition penalty. + 1.0 means no penalty + ignore_eos (bool): indicator for ignoring eos + session_id (int): if not specified, will set random value + + Yields: + json objects in openai formats + """ + pload = { + k: v + for k, v in locals().copy().items() + if k[:2] != '__' and k not in ['self'] + } + headers = {'content-type': 'application/json'} + response = requests.post(self.chat_completions_v1_url, + headers=headers, + json=pload, + stream=stream) + for chunk in response.iter_lines(chunk_size=8192, + decode_unicode=False, + delimiter=b'\n'): + if chunk: + if stream: + decoded = chunk.decode('utf-8') + if decoded == 'data: [DONE]': + continue + if decoded[:6] == 'data: ': + decoded = decoded[6:] + output = json.loads(decoded) + yield output + else: + decoded = chunk.decode('utf-8') + output = json.loads(decoded) + yield output + + def chat_interactive_v1(self, + prompt: Union[str, List[Dict[str, str]]], + session_id: int = -1, + interactive_mode: bool = False, + stream: bool = False, + stop: bool = False, + request_output_len: int = 512, + top_p: float = 0.8, + top_k: int = 40, + temperature: float = 0.8, + repetition_penalty: float = 1.0, + ignore_eos: bool = False, + **kwargs): + """Interactive completions. + + - On interactive mode, the chat history is kept on the server. Please + set `interactive_mode = True`. + - On normal mode, no chat history is kept on the server. Set + `interactive_mode = False`. 
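A sketch of the two modes described above, driven through this client; the server address, session id and prompts are placeholders:

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient('http://0.0.0.0:23333')  # placeholder address

# interactive mode: the server keeps the dialog history under session_id
for item in api_client.chat_interactive_v1('Hi, who are you?',
                                           session_id=7,
                                           interactive_mode=True,
                                           stream=True):
    print(item['text'], end='', flush=True)

# normal mode: stateless, so the full context must be resent on every call
for item in api_client.chat_interactive_v1('Introduce yourself in one line.',
                                           interactive_mode=False):
    print(item['text'])
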
+ + Args: + prompt: the prompt to use for the generation. + session_id: determine which instance will be called. + If not specified with a value other than -1, using random value + directly. + interactive_mode (bool): turn on interactive mode or not. On + interactive mode, session history is kept on the server (and + vice versa). + stream: whether to stream the results or not. + stop: whether to stop the session response or not. + request_output_len (int): output token nums + top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or + higher are kept for generation. + top_k (int): The number of the highest probability vocabulary + tokens to keep for top-k-filtering + temperature (float): to modulate the next token probability + repetition_penalty (float): The parameter for repetition penalty. + 1.0 means no penalty + ignore_eos (bool): indicator for ignoring eos + + Yields: + json objects consist of text, tokens, finish_reason + """ + pload = { + k: v + for k, v in locals().copy().items() + if k[:2] != '__' and k not in ['self'] + } + headers = {'content-type': 'application/json'} + response = requests.post(self.chat_intractive_v1_url, + headers=headers, + json=pload, + stream=stream) + for chunk in response.iter_lines(chunk_size=8192, + decode_unicode=False, + delimiter=b'\n'): + if chunk: + decoded = chunk.decode('utf-8') + output = json.loads(decoded) + yield output + + def completions_v1( + self, + model: str, + prompt: Union[str, List[Any]], + suffix: Optional[str] = None, + temperature: Optional[float] = 0.7, + n: Optional[int] = 1, + max_tokens: Optional[int] = 16, + stream: Optional[bool] = False, + top_p: Optional[float] = 1.0, + user: Optional[str] = None, + # additional argument of lmdeploy + repetition_penalty: Optional[float] = 1.0, + session_id: Optional[int] = -1, + ignore_eos: Optional[bool] = False, + **kwargs): + """Chat completion v1. + + Args: + model (str): model name. Available from /v1/models. + prompt (str): the input prompt. + suffix (str): The suffix that comes after a completion of inserted + text. + max_tokens (int): output token nums + temperature (float): to modulate the next token probability + top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or + higher are kept for generation. + n (int): How many chat completion choices to generate for each + input message. Only support one here. + stream: whether to stream the results or not. Default to false. + repetition_penalty (float): The parameter for repetition penalty. + 1.0 means no penalty + user (str): A unique identifier representing your end-user. 
+ ignore_eos (bool): indicator for ignoring eos + session_id (int): if not specified, will set random value + + Yields: + json objects in openai formats + """ + pload = { + k: v + for k, v in locals().copy().items() + if k[:2] != '__' and k not in ['self'] + } + headers = {'content-type': 'application/json'} + response = requests.post(self.completions_v1_url, + headers=headers, + json=pload, + stream=stream) + for chunk in response.iter_lines(chunk_size=8192, + decode_unicode=False, + delimiter=b'\n'): + if chunk: + if stream: + decoded = chunk.decode('utf-8')[6:] + if decoded == 'data: [DONE]': + continue + if decoded[:6] == 'data: ': + decoded = decoded[6:] + output = json.loads(decoded) + yield output + else: + decoded = chunk.decode('utf-8') + output = json.loads(decoded) + yield output + + def chat(self, + prompt: str, + session_id: int, + request_output_len: int = 512, + stream: bool = False, + top_p: float = 0.8, + top_k: int = 40, + temperature: float = 0.8, + repetition_penalty: float = 1.0, + ignore_eos: bool = False): + """Chat with a unique session_id. + + Args: + prompt: the prompt to use for the generation. + session_id: determine which instance will be called. + If not specified with a value other than -1, using random value + directly. + stream: whether to stream the results or not. + stop: whether to stop the session response or not. + request_output_len (int): output token nums + top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or + higher are kept for generation. + top_k (int): The number of the highest probability vocabulary + tokens to keep for top-k-filtering + temperature (float): to modulate the next token probability + repetition_penalty (float): The parameter for repetition penalty. + 1.0 means no penalty + ignore_eos (bool): indicator for ignoring eos + + Yields: + text, tokens, finish_reason + """ + assert session_id != -1, 'please set a value other than -1' + for outputs in self.chat_interactive_v1( + prompt, + session_id=session_id, + request_output_len=request_output_len, + interactive_mode=True, + stream=stream, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + ignore_eos=ignore_eos): + if outputs['finish_reason'] == 'length': + print('WARNING: exceed session max length.' + ' Please end the session.') + yield outputs['text'], outputs['tokens'], outputs['finish_reason'] + + def end_session(self, session_id: int): + """End the session with a unique session_id. + + Args: + session_id: determine which instance will be called. + If not specified with a value other than -1, using random value + directly. 
+ """ + for out in self.chat_interactive_v1(prompt='', + session_id=session_id, + request_output_len=0, + interactive_mode=False): + pass + + +def input_prompt(): + """Input a prompt in the consolo interface.""" + print('\ndouble enter to end input >>> ', end='') + sentinel = '' # ends when this string is seen + return '\n'.join(iter(input, sentinel)) + + def get_streaming_response(prompt: str, api_url: str, session_id: int, request_output_len: int = 512, stream: bool = True, - sequence_start: bool = True, - sequence_end: bool = True, + interactive_mode: bool = False, ignore_eos: bool = False, stop: bool = False) -> Iterable[List[str]]: headers = {'User-Agent': 'Test Client'} @@ -30,8 +322,7 @@ def get_streaming_response(prompt: str, 'stream': stream, 'session_id': session_id, 'request_output_len': request_output_len, - 'sequence_start': sequence_start, - 'sequence_end': sequence_end, + 'interactive_mode': interactive_mode, 'ignore_eos': ignore_eos, 'stop': stop } @@ -50,43 +341,26 @@ def get_streaming_response(prompt: str, yield output, tokens, finish_reason -def input_prompt(): - """Input a prompt in the consolo interface.""" - print('\ndouble enter to end input >>> ', end='') - sentinel = '' # ends when this string is seen - return '\n'.join(iter(input, sentinel)) - - -def main(restful_api_url: str, session_id: int = 0): - nth_round = 1 +def main(api_server_url: str, session_id: int = 0): + api_client = APIClient(api_server_url) while True: prompt = input_prompt() - if prompt == 'exit': - for output, tokens, finish_reason in get_streaming_response( - '', - f'{restful_api_url}/generate', - session_id=session_id, - request_output_len=0, - sequence_start=(nth_round == 1), - sequence_end=True): - pass - exit(0) + if prompt in ['exit', 'end']: + api_client.end_session(session_id) + if prompt == 'exit': + exit(0) else: - for output, tokens, finish_reason in get_streaming_response( + for text, tokens, finish_reason in api_client.chat( prompt, - f'{restful_api_url}/generate', session_id=session_id, request_output_len=512, - sequence_start=(nth_round == 1), - sequence_end=False): + stream=True): if finish_reason == 'length': - print('WARNING: exceed session max length.' - ' Please end the session.') continue - print(output, end='') - - nth_round += 1 + print(text, end='') if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 94271c4b9b..97e5e518c9 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
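The command-line loop above is a thin wrapper over APIClient. For programmatic use, the OpenAI-compatible chat route can be called in much the same way; a sketch with a placeholder server address, reading back the OpenAI-style response:

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient('http://0.0.0.0:23333')  # placeholder address
model_name = api_client.available_models[0]

messages = [dict(role='user', content='Hello!')]
for output in api_client.chat_completions_v1(model=model_name,
                                             messages=messages,
                                             stream=False):
    print(output['choices'][0]['message']['content'])
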
+import asyncio import os +import random import time from http import HTTPStatus from typing import AsyncGenerator, List, Optional -import fire import uvicorn from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware @@ -14,8 +15,10 @@ from lmdeploy.serve.openai.protocol import ( # noqa: E501 ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, ChatMessage, DeltaMessage, EmbeddingsRequest, - EmbeddingsResponse, ErrorResponse, GenerateRequest, GenerateResponse, + ChatCompletionStreamResponse, ChatMessage, CompletionRequest, + CompletionResponse, CompletionResponseChoice, + CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage, + EmbeddingsRequest, ErrorResponse, GenerateRequest, GenerateResponse, ModelCard, ModelList, ModelPermission, UsageInfo) os.environ['TM_LOG_LEVEL'] = 'ERROR' @@ -105,9 +108,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, 1.0 means no penalty Additional arguments supported by LMDeploy: - - renew_session (bool): Whether renew the session. Can be used when the - session length is exceeded. - ignore_eos (bool): indicator for ignoring eos + - session_id (int): if not specified, will set random value Currently we do not support the following features: - function_call (Users should implement this by themselves) @@ -115,20 +117,22 @@ async def chat_completions_v1(request: ChatCompletionRequest, - presence_penalty (replaced with repetition_penalty) - frequency_penalty (replaced with repetition_penalty) """ - session_id = ip2id(raw_request.client.host) + if request.session_id == -1: + request.session_id = random.randint(1, 10086) error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret model_name = request.model - request_id = str(session_id) + request_id = str(request.session_id) created_time = int(time.time()) - result_generator = VariableInterface.async_engine.generate_openai( + result_generator = VariableInterface.async_engine.generate( request.messages, - session_id, + request.session_id, True, # always use stream to enable batching - request.renew_session, + sequence_start=True, + sequence_end=True, request_output_len=request.max_tokens if request.max_tokens else 512, stop=request.stop, top_p=request.top_p, @@ -189,7 +193,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: async for res in result_generator: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.async_engine.stop_session(session_id) + VariableInterface.async_engine.stop_session(request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') final_res = res @@ -223,43 +227,191 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: return response -@app.post('/v1/embeddings') -async def create_embeddings(request: EmbeddingsRequest, - raw_request: Request = None): - """Creates embeddings for the text.""" +@app.post('/v1/completions') +async def completions_v1(request: CompletionRequest, + raw_request: Request = None): + """Completion API similar to OpenAI's API. + + Go to `https://platform.openai.com/docs/api-reference/completions/create` + for the API specification. + + The request should be a JSON object with the following fields: + - model (str): model name. Available from /v1/models. + - prompt (str): the input prompt. 
+ - suffix (str): The suffix that comes after a completion of inserted text. + - max_tokens (int): output token nums + - temperature (float): to modulate the next token probability + - top_p (float): If set to float < 1, only the smallest set of most + probable tokens with probabilities that add up to top_p or higher + are kept for generation. + - n (int): How many chat completion choices to generate for each input + message. Only support one here. + - stream: whether to stream the results or not. Default to false. + - repetition_penalty (float): The parameter for repetition penalty. + 1.0 means no penalty + - user (str): A unique identifier representing your end-user. + + Additional arguments supported by LMDeploy: + - ignore_eos (bool): indicator for ignoring eos + - session_id (int): if not specified, will set random value + + Currently we do not support the following features: + - logprobs (not supported yet) + - presence_penalty (replaced with repetition_penalty) + - frequency_penalty (replaced with repetition_penalty) + """ + if request.session_id == -1: + request.session_id = random.randint(1, 10086) error_check_ret = await check_request(request) if error_check_ret is not None: return error_check_ret - embedding = await VariableInterface.async_engine.get_embeddings( - request.input) - data = [{'object': 'embedding', 'embedding': embedding, 'index': 0}] - token_num = len(embedding) - return EmbeddingsResponse( - data=data, - model=request.model, - usage=UsageInfo( - prompt_tokens=token_num, - total_tokens=token_num, - completion_tokens=None, - ), - ).dict(exclude_none=True) - - -@app.post('/generate') -async def generate(request: GenerateRequest, raw_request: Request = None): + model_name = request.model + request_id = str(request.session_id) + created_time = int(time.time()) + if isinstance(request.prompt, str): + request.prompt = [request.prompt] + generators = [] + for i in range(len(request.prompt)): + result_generator = VariableInterface.async_engine.generate( + request.prompt[i], + request.session_id + i, + True, # always use stream to enable batching + sequence_start=True, + sequence_end=True, + request_output_len=request.max_tokens + if request.max_tokens else 512, + stop=False, + top_p=request.top_p, + temperature=request.temperature, + repetition_penalty=request.repetition_penalty, + ignore_eos=request.ignore_eos, + do_preprocess=False) + generators.append(result_generator) + + def create_stream_response_json( + index: int, + text: str, + finish_reason: Optional[str] = None, + ) -> str: + choice_data = CompletionResponseStreamChoice( + index=index, + text=text, + finish_reason=finish_reason, + ) + response = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[choice_data], + ) + response_json = response.model_dump_json() + + return response_json + + async def completion_stream_generator() -> AsyncGenerator[str, None]: + # First chunk with role + for generator in generators: + for i in range(request.n): + choice_data = CompletionResponseStreamChoice( + index=i, + text='', + finish_reason=None, + ) + chunk = CompletionStreamResponse(id=request_id, + choices=[choice_data], + model=model_name) + data = chunk.model_dump_json(exclude_unset=True) + yield f'data: {data}\n\n' + + async for res in generator: + response_json = create_stream_response_json( + index=0, + text=res.response, + ) + yield f'data: {response_json}\n\n' + yield 'data: [DONE]\n\n' + + # Streaming response + if request.stream: + return 
StreamingResponse(completion_stream_generator(), + media_type='text/event-stream') + + # Non-streaming response + usage = UsageInfo() + choices = [] + + async def _inner_call(i, generator): + final_res = None + text = '' + async for res in generator: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. + VariableInterface.async_engine.stop_session(request.session_id) + return create_error_response(HTTPStatus.BAD_REQUEST, + 'Client disconnected') + final_res = res + text += res.response + assert final_res is not None + choice_data = CompletionResponseChoice( + index=0, + text=text, + finish_reason=final_res.finish_reason, + ) + choices.append(choice_data) + + total_tokens = sum([ + final_res.history_token_len, final_res.input_token_len, + final_res.generate_token_len + ]) + usage.prompt_tokens += final_res.input_token_len + usage.completion_tokens += final_res.generate_token_len + usage.total_tokens += total_tokens + + await asyncio.gather( + *[_inner_call(i, generators[i]) for i in range(len(generators))]) + + response = CompletionResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + ) + + return response + + +@app.post('/v1/embeddings', tags=['unsupported']) +async def create_embeddings(request: EmbeddingsRequest, + raw_request: Request = None): + """Creates embeddings for the text.""" + return create_error_response(HTTPStatus.BAD_REQUEST, + 'Unsupported by turbomind.') + + +@app.post('/generate', + tags=['deprecated'], + description='please use /v1/chat/interactive') +@app.post('/v1/chat/interactive') +async def chat_interactive_v1(request: GenerateRequest, + raw_request: Request = None): """Generate completion for the request. + - On interactive mode, the chat history is kept on the server. Please set + `interactive_mode = True`. + - On normal mode, no chat history is kept on the server. Set + `interactive_mode = False`. + The request should be a JSON object with the following fields: - prompt: the prompt to use for the generation. - session_id: determine which instance will be called. If not specified - with a value other than -1, using host ip directly. - - sequence_start (bool): indicator for starting a sequence. - - sequence_end (bool): indicator for ending a sequence + with a value other than -1, using random value directly. + - interactive_mode (bool): turn on interactive mode or not. On interactive + mode, session history is kept on the server (and vice versa). - stream: whether to stream the results or not. - stop: whether to stop the session response or not. - request_output_len (int): output token nums - - step (int): the offset of the k/v cache - top_p (float): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. 
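The /v1/completions route implemented in the hunk above accepts the standard OpenAI completion fields plus the LMDeploy extras listed in its docstring. A minimal non-streaming sketch with a plain HTTP client; the address and model name are placeholders:

import requests

api_server_url = 'http://0.0.0.0:23333'  # placeholder: a running api_server
payload = {
    'model': 'your-served-model',        # any name returned by /v1/models
    'prompt': 'Write a one-line summary of tensor parallelism.',
    'max_tokens': 64,
    'stream': False,
}
resp = requests.post(f'{api_server_url}/v1/completions', json=payload)
print(resp.json()['choices'][0]['text'])
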
@@ -271,15 +423,18 @@ async def generate(request: GenerateRequest, raw_request: Request = None): - ignore_eos (bool): indicator for ignoring eos """ if request.session_id == -1: - session_id = ip2id(raw_request.client.host) - request.session_id = session_id + request.session_id = random.randint(10087, 23333) - generation = VariableInterface.async_engine.generate( + async_engine = VariableInterface.async_engine + sequence_start = async_engine.steps.get(str(request.session_id), 0) == 0 + sequence_end = not request.interactive_mode + + generation = async_engine.generate( request.prompt, request.session_id, stream_response=True, # always use stream to enable batching - sequence_start=request.sequence_start, - sequence_end=request.sequence_end, + sequence_start=sequence_start, + sequence_end=sequence_end, request_output_len=request.request_output_len, top_p=request.top_p, top_k=request.top_k, @@ -308,7 +463,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: async for out in generation: if await raw_request.is_disconnected(): # Abort the request if the client disconnects. - VariableInterface.async_engine.stop_session(session_id) + async_engine.stop_session(request.session_id) return create_error_response(HTTPStatus.BAD_REQUEST, 'Client disconnected') text += out.response @@ -319,14 +474,15 @@ async def stream_results() -> AsyncGenerator[bytes, None]: def main(model_path: str, - server_name: str = 'localhost', + server_name: str = '0.0.0.0', server_port: int = 23333, instance_num: int = 32, tp: int = 1, allow_origins: List[str] = ['*'], allow_credentials: bool = True, allow_methods: List[str] = ['*'], - allow_headers: List[str] = ['*']): + allow_headers: List[str] = ['*'], + **kwargs): """An example to perform model inference through the command line interface. 
@@ -352,9 +508,12 @@ def main(model_path: str, VariableInterface.async_engine = AsyncEngine(model_path=model_path, instance_num=instance_num, - tp=tp) + tp=tp, + **kwargs) uvicorn.run(app=app, host=server_name, port=server_port, log_level='info') if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 756af1a4ca..bee2e2c91c 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -70,7 +70,7 @@ class ChatCompletionRequest(BaseModel): user: Optional[str] = None # additional argument of lmdeploy repetition_penalty: Optional[float] = 1.0 - renew_session: Optional[bool] = False + session_id: Optional[int] = -1 ignore_eos: Optional[bool] = False @@ -135,6 +135,10 @@ class CompletionRequest(BaseModel): presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 user: Optional[str] = None + # additional argument of lmdeploy + repetition_penalty: Optional[float] = 1.0 + session_id: Optional[int] = -1 + ignore_eos: Optional[bool] = False class CompletionResponseChoice(BaseModel): @@ -175,7 +179,7 @@ class CompletionStreamResponse(BaseModel): class EmbeddingsRequest(BaseModel): """Embedding request.""" model: str = None - input: Union[str, List[Any]] + input: Union[str, List[str]] user: Optional[str] = None @@ -191,8 +195,7 @@ class GenerateRequest(BaseModel): """Generate request.""" prompt: Union[str, List[Dict[str, str]]] session_id: int = -1 - sequence_start: bool = True - sequence_end: bool = False + interactive_mode: bool = False stream: bool = False stop: bool = False request_output_len: int = 512 diff --git a/lmdeploy/serve/turbomind/chatbot.py b/lmdeploy/serve/turbomind/chatbot.py index cc12fcff3b..5b89cc506a 100644 --- a/lmdeploy/serve/turbomind/chatbot.py +++ b/lmdeploy/serve/turbomind/chatbot.py @@ -459,6 +459,10 @@ def _stream_infer(self, session.sequence_length = 0 input_ids, input_lengths = self.preprocess(prompt) + # will crash if last_token_id == eos_id and send empty input_ids + if sequence_end and request_output_len == 0: + input_ids = np.array([[self.bos_id]], dtype=np.uint32) + input_lengths = np.array([[1]], dtype=np.uint32) input_tokens = input_lengths.squeeze() if self.profile_generation: yield StatusCode.TRITON_STREAM_ING, \ @@ -657,8 +661,13 @@ def stream_consumer(postprocess, res_queue, session, n_input_token, continue output_str = postprocess( output_ids, np.array([[n_token]], dtype=np.uint32)) - n_token = output_ids.shape[-1] text = output_str[0].decode() + # utf-8 char at the end means it's a potential unfinished + # byte sequence, continue to concate it with the next + # sequence and decode them together + if text.endswith('�'): + continue + n_token = output_ids.shape[-1] if display: print(text, end='', flush=True) session.response += text diff --git a/lmdeploy/serve/turbomind/deploy.py b/lmdeploy/serve/turbomind/deploy.py deleted file mode 100644 index cc8db88f5c..0000000000 --- a/lmdeploy/serve/turbomind/deploy.py +++ /dev/null @@ -1,1046 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
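The stream_consumer change in chatbot.py above holds back output whenever the decoded chunk ends with the replacement character, so a multi-byte UTF-8 character split across two chunks is only printed once it is complete. A self-contained illustration of the failure mode it guards against (plain bytes, not the tokenizer path itself):

data = '你好'.encode('utf-8')  # two 3-byte characters, six bytes in total
partial = data[:4]            # the second character is cut in half

text = partial.decode('utf-8', errors='replace')
assert text.endswith('\ufffd')  # decodes to '你' plus the replacement char '�'

# once the remaining bytes arrive, the concatenated buffer decodes cleanly,
# which is why the consumer defers printing until the next iteration
assert data.decode('utf-8') == '你好'
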
-import configparser -import json -import os -import os.path as osp -import re -import shutil -import sys -from pathlib import Path - -import fire -import safetensors -import torch -from safetensors.torch import load_file -from sentencepiece import SentencePieceProcessor - -import lmdeploy -from lmdeploy.model import MODELS - -supported_formats = ['llama', 'hf', 'awq', 'qwen'] - - -def get_package_root_path(): - import lmdeploy - return Path(lmdeploy.__file__).parent - - -def create_workspace(_path: str): - """Create a workspace. - - Args: - _path (str): the path of the workspace - Returns: - bool: success or not - """ - try: - if osp.exists(_path): - shutil.rmtree(_path) - os.makedirs(_path) - print(f'create workspace in directory {_path}') - return True - except Exception as e: - print(f'create workspace in {_path} failed: {e}') - return False - - -def destroy_workspace(_path: str): - """destroy workspace. - - Args: - _path(str): the path of the workspace - Returns: - bool: success or not - """ - try: - shutil.rmtree(_path) - print(f'destroy workspace in directory {_path}') - return True - except Exception as e: - print(f'destroy workspace in {_path} failed: {e}') - return False - - -def copy_triton_model_templates(_path: str): - """copy triton model templates to the specified path. - - Args: - _path (str): the target path - Returns: - str: the path of the triton models - """ - try: - cur_path = osp.abspath(__file__) - dir_path = osp.dirname(cur_path) - triton_models_path = osp.join(dir_path, 'triton_models') - dst_path = osp.join(_path, 'triton_models') - shutil.copytree(triton_models_path, dst_path, symlinks=True) - print(f'copy triton model templates from "{triton_models_path}" to ' - f'"{dst_path}" successfully') - shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path) - return dst_path - except Exception as e: - print(f'copy triton model templates from "{triton_models_path}"' - f' to "{dst_path}" failed: {e}') - return None - - -def tokenizer_info_sp(model_path: str): - """Return the vocabulary size, bos token id and eos token id. - - Args: - model_path (str): the tokenizer model's path - Returns: - tuple: vocabulary size, bos token id and eos token id - """ - assert os.path.isfile(model_path), model_path - sp_model = SentencePieceProcessor(model_file=model_path) - # BOS / EOS token IDs - n_words = sp_model.vocab_size() - bos_id = sp_model.bos_id() - eos_id = sp_model.eos_id() - return n_words, bos_id, eos_id - - -def tokenizer_info_qwen(model_dir: str): - n_words = 151851 - bos_id = 0 - eos_id = 151643 - return n_words, bos_id, eos_id - - -def load_checkpoint(model_path): - """Load checkpoint files into torch format. 
- - Args: - model_path (str): the checkpoint folder - Returns: - Dict[str, torch.Tensor]: weight in torch format - """ - suffixes = ['.safetensors', '.bin'] - for suffix in suffixes: - files = [ - file for file in os.listdir(model_path) if file.endswith(suffix) - ] - if len(files) > 0: - break - - assert len(files) > 0, f'could not find checkpoints in {model_path}' - files = sorted(files) - print(files) - params = {} - for file in files: - if file.endswith('.bin'): - tmp = torch.load(osp.join(model_path, file), map_location='cpu') - else: - tmp = load_file(osp.join(model_path, file)) - params.update(tmp) - return params - - -def export(model_name: str, - num_layer: int, - norm_eps: float, - kv_head_num: int, - model_params: dict, - tokenizer_path: str, - out_dir: str, - tp: int, - size_per_head: int = 128, - group_size: int = 0, - weight_type: str = 'fp16', - max_position_embeddings: int = 0, - use_dynamic_ntk: int = 0, - use_logn_attn: int = 0, - rope_theta: float = 10000.0, - tokenizer_info=tokenizer_info_sp): - """Export deploying information to a config file. - - Args: - model_name (str): model's name - num_layer (int): the number of transformer blocks - norm_eps (float): norm epsilon - model_params (dict): parameters of a model - tokenizer_path (str): the tokenizer model's path - out_dir (str): the path of the output directory - tp (int): the number of tensor parallelism - size_per_head (int): the dimension of each head - """ - out_dir = osp.join(out_dir, 'weights') - os.makedirs(out_dir, exist_ok=True) - - def save_bin(param: torch.Tensor, name): - print(name, param.shape) - if param.dtype in [torch.float, torch.bfloat16]: - param = param.half() - param.contiguous().cpu().numpy().tofile(osp.join(out_dir, name)) - - attn_bias = False - inter_size = 0 - - tok_embeddings = model_params['tok_embeddings.weight'] - _vocab_size, dim = tok_embeddings.shape - head_num = dim // size_per_head - if _vocab_size % tp != 0: - # Resolve https://github.com/InternLM/lmdeploy/issues/266 - # Pad tok_embeddings and output weights, making their shape divisible by TP # noqa: E501 - pad_size = (_vocab_size + tp - 1) // tp * tp - _vocab_size - # Pad weight at the bottom of dim 0 - model_params['tok_embeddings.weight'] = torch.nn.functional.pad( - tok_embeddings, (0, 0, 0, pad_size), 'constant', 0) - # Pad output weight at the bottom of dim 0 - model_params['output.weight'] = torch.nn.functional.pad( - model_params['output.weight'], (0, 0, 0, pad_size), 'constant', 0) - - # reverse the splitting axes since the weights are transposed above - for param_name, param_data in model_params.items(): - split_dim = None - key, ext = param_name.split('.')[-2:] - if key == 'w_qkv' and ext == 'bias': - attn_bias = True - copy = False - if key in ['w1', 'w3', 'w13', 'w_qkv']: - split_dim = -1 - # TODO: move parameter extraction outside of the loop - if key == 'w1': - inter_size = max(inter_size, param_data.shape[-1]) - elif key == 'w13': - inter_size = max(inter_size, param_data.shape[-1] // 2) - elif key in ['w2', 'wo']: - if ext in ['bias']: - copy = True - else: - split_dim = 0 - if split_dim is not None: - print(f'*** splitting {param_name}, shape={param_data.shape}, ' - f'split_dim={split_dim}') - assert param_data.shape[split_dim] % tp == 0 - split_size = param_data.shape[split_dim] // tp - splits = torch.split(param_data, split_size, dim=split_dim) - for i, split in enumerate(splits): - prefix, ext = osp.splitext(param_name) - save_bin(split, f'{prefix}.{i}{ext}') - elif copy: - print(f'### copying {param_name}, 
shape={param_data.shape}') - copies = [param_data] * tp - for i, copy in enumerate(copies): - prefix, ext = osp.splitext(param_name) - save_bin(copy, f'{prefix}.{i}{ext}') - else: - save_bin(param_data, param_name) - - assert inter_size > 0 - - # export config and save it to {out_dir}/config.ini - model = MODELS.get(model_name)() - vocab_size, bos_id, eos_id = tokenizer_info(tokenizer_path) - assert _vocab_size >= vocab_size, \ - f'different vocab size {_vocab_size} vs {vocab_size}' - cfg = dict(llama=dict( - model_name=model_name, - head_num=head_num, - kv_head_num=kv_head_num, - size_per_head=size_per_head, - vocab_size=_vocab_size, - num_layer=num_layer, - rotary_embedding=size_per_head, - rope_theta=rope_theta, - inter_size=inter_size, - norm_eps=norm_eps, - attn_bias=int(attn_bias), - start_id=bos_id, - end_id=eos_id, - weight_type=weight_type, - group_size=group_size, - # parameters for turbomind - max_batch_size=32, - max_context_token_num=4, - session_len=model.session_len + 8, - step_length=1, - cache_max_entry_count=48, - cache_chunk_size=1, - use_context_fmha=1, - quant_policy=0, - tensor_para_size=tp, - # extra attention params - max_position_embeddings=max_position_embeddings, - use_dynamic_ntk=int(use_dynamic_ntk), - use_logn_attn=int(use_logn_attn), - )) - - config = configparser.ConfigParser() - for section, key_values in cfg.items(): - config[section] = key_values - - config_path = osp.join(out_dir, 'config.ini') - with open(config_path, 'w') as f: - config.write(f) - return True - - -def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int, - dim: int): - - def reshape(x): - return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1) - - qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1) - - # (input_dim, head_num + 2 * kv_head_num) - return qkv.view(q.size(0), -1) - - -def deploy_llama(model_name: str, model_path: str, tokenizer_path: str, - triton_models_path: str, tp: int): - """Deploy a model with huggingface transformers' format. 
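# --- illustrative sketch ---------------------------------------------------
# A shape check for merge_qkv() above: q/k/v are reshaped so that each
# tensor-parallel rank's slice occupies its own middle dimension, then
# concatenated, giving every rank a contiguous [q|k|v] block.  The GQA-style
# shapes below (32 query heads, 8 kv heads, head dim 128) are illustrative.
import torch

def merge_qkv_demo(q, k, v, tp):
    def reshape(x):
        return x.view(x.size(0), tp, -1)
    qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
    return qkv.view(q.size(0), -1)

q = torch.randn(4096, 32 * 128)
k = torch.randn(4096, 8 * 128)
v = torch.randn(4096, 8 * 128)
print(merge_qkv_demo(q, k, v, tp=2).shape)       # torch.Size([4096, 6144])
# ---------------------------------------------------------------------------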
- - Args: - model_name (str): the name of the to-be-deployed model - model_path (str): the path of the directory where the model weight - files are - tokenizer_path (str): the path of the tokenizer model path - triton_models_path (str): the path of the exported triton models - tp (int): the number of tensor parallelism - """ - if osp.exists(tokenizer_path): - shutil.copy(tokenizer_path, - osp.join(triton_models_path, 'tokenizer/tokenizer.model')) - with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'tokenizer.py'), - osp.join(triton_models_path, 'tokenizer')) - else: - print(f'tokenizer model {tokenizer_path} does not exist') - return False - # read model arguments from params.json - try: - params_path = osp.join(model_path, 'params.json') - with open(params_path) as f: - model_arg = json.load(f) - num_layer = model_arg['n_layers'] - norm_eps = model_arg['norm_eps'] - head_num = model_arg.get('n_heads', 32) - kv_head_num = model_arg.get('n_kv_heads', head_num) - except Exception as e: - print(f'get "n_layers" and "norm_eps" from {params_path} failed: {e}') - return False - - # convert weights from llama to turbomind format - checkpoints = [] - for pattern in ['*.pth', '*.pt']: - checkpoints += sorted(Path(model_path).glob(pattern)) - print(checkpoints) - n_ckpt = len(checkpoints) - model_params = {} - - def get_param(_name, _size): - print(_name, _size) - if _name not in model_params: - model_params[_name] = torch.zeros(_size, - dtype=torch.float16, - device='cpu') - return model_params[_name] - - for i, ckpt_path in enumerate(checkpoints): - ckpt = torch.load(ckpt_path, map_location='cpu') - for param_name, param_data in ckpt.items(): - key, ext = param_name.split('.')[-2:] - # column-parallel - if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']: - size = param_data.size(0) - if ext == 'weight': - param = get_param( - param_name, - [size * n_ckpt, param_data.size(1)]) - param.data[size * i:size * (i + 1), :] = param_data - else: # bias - param = get_param(param_name, [size * n_ckpt]) - param.data[size * i:size * (i + 1)] = param_data - # row-parallel - elif key in ['w2', 'wo', 'tok_embeddings']: - size = param_data.size(-1) - if ext == 'weight': - param = get_param(param_name, - [param_data.size(0), size * n_ckpt]) - param.data[:, size * i:size * (i + 1)] = param_data - else: # bias - param = get_param(param_name, [size]) - param.data = param_data - elif i == 0: - param = get_param(param_name, param_data.size()) - param.data = param_data - del ckpt - - for name, param in model_params.items(): - # transpose all weights as TurboMind is expecting column-major - # weights: (output_dims, input_dims) -> (input_dims, output_dims) - key = name.split('.')[-2] - if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']: - param.data = param.data.t() - - # concat qkv projection - for t in ['weight', 'bias']: - for i in range(1000): - _qkv = [ - f'layers.{i}.attention.{k}.{t}' for k in ['wq', 'wk', 'wv'] - ] - try: - qkv = tuple(map(model_params.pop, _qkv)) - except KeyError: - break - # concat by heads - qkv = merge_qkv(*qkv, tp, dim=2 if t == 'weight' else 1) - print(f'layers.{i}.attention.w_qkv.{t}', qkv.shape) - model_params[f'layers.{i}.attention.w_qkv.{t}'] = qkv - - assert i == 0 or num_layer == i, f'miss matched layers: {num_layer} vs {i}' - - return export(model_name, num_layer, norm_eps, kv_head_num, model_params, - tokenizer_path, triton_models_path, tp) - - -def permute(x: torch.Tensor): - SIZE_PER_HEAD = 128 - if x.shape[-1] > 1: - dim = x.shape[-1] - n_heads = dim 
// SIZE_PER_HEAD - return x.view(-1, n_heads, 2, - dim // n_heads // 2).transpose(2, 3).reshape(-1, dim) - else: # scales, zeros - dim = x.shape[0] - n_heads = dim // SIZE_PER_HEAD - return x.view(n_heads, 2, dim // n_heads // 2, - 1).transpose(1, 2).reshape(dim, 1) - - -def deploy_hf(model_name: str, model_path: str, tokenizer_path: str, - triton_models_path: str, tp: int): - """Deploy a model with huggingface transformers' format. - - Args: - model_name (str): the name of the to-be-deployed model - model_path (str): the path of the directory where the model weight - files are - tokenizer_path (str): the path of the tokenizer model path - triton_models_path (str): the path of the exported triton models - tp (int): the number of tensor parallelism - """ - if tokenizer_path is None: - tokenizer_path = osp.join(model_path, 'tokenizer.model') - if osp.exists(tokenizer_path): - shutil.copy(tokenizer_path, - osp.join(triton_models_path, 'tokenizer/tokenizer.model')) - for _file in os.listdir(model_path): - if _file.endswith('.json') or _file.endswith('.py'): - json_path = osp.join(model_path, _file) - shutil.copy(json_path, - osp.join(triton_models_path, 'tokenizer', _file)) - with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'tokenizer.py'), - osp.join(triton_models_path, 'tokenizer')) - else: - print(f'tokenizer model {tokenizer_path} does not exist') - exit(-1) - - # read model arguments from params.json - try: - params_path = osp.join(model_path, 'config.json') - with open(params_path) as f: - model_arg = json.load(f) - num_layer = model_arg['num_hidden_layers'] - norm_eps = model_arg['rms_norm_eps'] - rope_theta = float(model_arg.get('rope_theta', 10000.0)) - max_position_embeddings = int( - model_arg.get('max_position_embeddings', 0)) - repo_scaling = bool(model_arg.get('rope_scaling', False)) - if 'num_key_value_heads' in model_arg: - kv_head_num = model_arg['num_key_value_heads'] - else: - kv_head_num = model_arg['num_attention_heads'] - except Exception as e: - print(f'get "num_hidden_layers" and "rms_norm_eps" from ' - f'{params_path} failed: {e}') - return False - - # convert weights from hf to turbomind - model_params = {} - - _qweight = 'weight' - _suffixes = [_qweight, 'bias'] - - _params = load_checkpoint(model_path) - - def get_tensor(name): - """return tensor according its name.""" - return _params[name] - - def get_tensor_transposed(name: str): - """return a transposed tensor according its name.""" - if name not in _params and name.find('bias'): - return None - return _params[name].t() - - w_pack = False - if 'model.layers.0.self_attn.W_pack.weight' in _params: - w_pack = True - - for i in range(1000): - try: - # attention weights - for suffix in _suffixes: - if w_pack: - _qkvo = [ - f'model.layers.{i}.self_attn.{t}' - for t in ['W_pack', 'o_proj'] - ] - qkv, o = map(get_tensor_transposed, - map(('{}.' + suffix).format, _qkvo)) - - if qkv is None: - continue - _shape = qkv.shape[1] // 3 - _qkv = torch.split(qkv, [_shape, _shape, _shape], dim=1) - q = _qkv[0] - k = _qkv[1] - v = _qkv[2] - - else: - _qkvo = [ - f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo' - ] - q, k, v, o = map(get_tensor_transposed, - map(('{}.' 
+ suffix).format, _qkvo)) - if q is None: - continue - # q, k has different layout for fb & hf, convert to fb's - # layout - q = permute(q) - k = permute(k) - if suffix == _qweight: # weight, qweight - qkv = merge_qkv(q, k, v, tp, dim=2) - print(suffix, qkv.shape) - else: # scales, zeros, bias - qkv = merge_qkv(q, k, v, tp, dim=1) - print(suffix, qkv.shape) - for k, v in [('w_qkv', qkv), ('wo', o)]: - model_params[f'layers.{i}.attention.{k}.{suffix}'] = v - # ffn weights - _w123 = [ - f'model.layers.{i}.mlp.{t}_proj' - for t in ['gate', 'down', 'up'] - ] - for suffix in _suffixes: - w1, w2, w3 = map(get_tensor_transposed, - map(('{}.' + suffix).format, _w123)) - if w1 is None: - continue - if suffix in ['scales', 'zeros', 'bias']: - w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3]) - for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]: - model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v - other = [('attention_norm.weight', 'input_layernorm.weight'), - ('ffn_norm.weight', 'post_attention_layernorm.weight')] - for ft, hf in other: - model_params[f'layers.{i}.' + - ft] = get_tensor(f'model.layers.{i}.' + hf) - except safetensors.SafetensorError: - break - except KeyError: - break - - assert num_layer == i, f'miss matched layers: {num_layer} vs {i}' - - other = [('tok_embeddings.weight', 'model.embed_tokens.weight'), - ('norm.weight', 'model.norm.weight'), - ('output.weight', 'lm_head.weight')] - for ft, hf in other: - model_params[ft] = get_tensor(hf) - - if model_name == 'baichuan2-7b': - # https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/modeling_baichuan.py#L507 - # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 - model_params['output.weight'] = torch.nn.functional.normalize( - model_params['output.weight']) - - return export(model_name, - num_layer, - norm_eps, - kv_head_num, - model_params, - tokenizer_path, - triton_models_path, - tp, - max_position_embeddings=max_position_embeddings, - use_dynamic_ntk=repo_scaling, - rope_theta=rope_theta) - - -def deploy_awq(model_name: str, model_path: str, tokenizer_path: str, - triton_models_path: str, tp: int, quant_path: str, - group_size: int): - """Deploy a model with huggingface transformers' format. 
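# --- illustrative sketch ---------------------------------------------------
# The W_pack handling above (Baichuan-style checkpoints), in isolation: q, k
# and v are stored fused along the output dimension and are recovered with an
# even three-way split after the transpose.  Shapes are illustrative only.
import torch

w_pack_t = torch.randn(4096, 3 * 4096)           # transposed: (in_dim, 3 * out_dim)
third = w_pack_t.shape[1] // 3
q, k, v = torch.split(w_pack_t, [third, third, third], dim=1)
print(q.shape, k.shape, v.shape)                 # three (4096, 4096) blocks
# ---------------------------------------------------------------------------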
- - Args: - model_name (str): the name of the to-be-deployed model - model_path (str): the path of the directory where the model weight - files are - tokenizer_path (str): the path of the tokenizer model path - triton_models_path (str): the path of the exported triton models - tp (int): the number of tensor parallelism - quant_path (str): path of the quantized model, which can be None - group_size (int): a parameter used in AWQ to quantize fp16 weights - to 4 bits - """ - if tokenizer_path is None: - tokenizer_path = osp.join(model_path, 'tokenizer.model') - if osp.exists(tokenizer_path): - shutil.copy(tokenizer_path, - osp.join(triton_models_path, 'tokenizer/tokenizer.model')) - for _file in os.listdir(model_path): - if _file.endswith('.json') or _file.endswith('.py'): - json_path = osp.join(model_path, _file) - shutil.copy(json_path, - osp.join(triton_models_path, 'tokenizer', _file)) - with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'tokenizer.py'), - osp.join(triton_models_path, 'tokenizer')) - else: - print(f'tokenizer model {tokenizer_path} does not exist') - exit(-1) - - # read model arguments from params.json - try: - params_path = osp.join(model_path, 'config.json') - with open(params_path) as f: - model_arg = json.load(f) - num_layer = model_arg['num_hidden_layers'] - norm_eps = model_arg['rms_norm_eps'] - rope_theta = float(model_arg.get('rope_theta', 10000.0)) - if 'num_key_value_heads' in model_arg: - kv_head_num = model_arg['num_key_value_heads'] - else: - kv_head_num = model_arg['num_attention_heads'] - except Exception as e: - print(f'get "num_hidden_layers" and "rms_norm_eps" from ' - f'{params_path} failed: {e}') - return False - - # convert weights from hf to turbomind - if quant_path is None: - _files = [ - osp.join(model_path, file) for file in os.listdir(model_path) - if file.endswith('.bin') - ] - _files = sorted(_files) - else: - _files = [quant_path] - - model_params = {} - - _params = {} - for _file in _files: - _tmp = torch.load(_file, map_location='cpu') - _params.update(_tmp) - - def get_tensor(name): - """return tensor according its name.""" - return _params[name].cuda().contiguous() - - # import _turbomind as _tm - # TODO: find another way import _turbomind - lmdeploy_dir = osp.split(lmdeploy.__file__)[0] - sys.path.append(osp.join(lmdeploy_dir, 'lib')) - import _turbomind as _tm # noqa: E402 - - def transpose_qk_s4(src: torch.Tensor): - assert src.is_contiguous() - dst = torch.zeros_like(src) - _tm.transpose_qk_s4_k_m8(src, dst, - src.size(-1) * 8, src.size(0), group_size) - return dst - - def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, - w1_s: torch.Tensor, w3_qw: torch.Tensor, - w3_qz: torch.Tensor, w3_s: torch.Tensor): - - def fuse(a: torch.Tensor, b: torch.Tensor): - ab = torch.cat((a, b)).contiguous() - _ab = torch.zeros_like(ab) - _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0)) - return _ab.view(a.size(0), -1) - - w13_qw = fuse(w1_qw, w3_qw) - w13_qz = fuse(w1_qz, w3_qz) - - w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1) - w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1) - - return w13_qw, w13_qz, w13_s - - def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor, - group_size: int): - assert qw.is_contiguous() - assert qz.is_contiguous() - assert s.is_contiguous() - _qw = torch.zeros_like(qw) - _sz = torch.zeros_like(s, dtype=torch.int32) # half2 - _ws = torch.zeros_like(s) - _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz, - qw.size(-1) * 8, qw.size(0), 
group_size) - return _qw, _sz - - def tp_m_s4(x: torch.Tensor, tp: int): - return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3, - 1).contiguous() - - attn_bias = False - - for i in range(num_layer): - print(i) - - # attention weights - q_qw = get_tensor(f'model.layers.{i}.self_attn.q_proj.qweight') - k_qw = get_tensor(f'model.layers.{i}.self_attn.k_proj.qweight') - v_qw = get_tensor(f'model.layers.{i}.self_attn.v_proj.qweight') - o_qw = get_tensor(f'model.layers.{i}.self_attn.o_proj.qweight') - - q_qz = get_tensor(f'model.layers.{i}.self_attn.q_proj.qzeros') - k_qz = get_tensor(f'model.layers.{i}.self_attn.k_proj.qzeros') - v_qz = get_tensor(f'model.layers.{i}.self_attn.v_proj.qzeros') - o_qz = get_tensor(f'model.layers.{i}.self_attn.o_proj.qzeros') - - q_s = get_tensor(f'model.layers.{i}.self_attn.q_proj.scales') - k_s = get_tensor(f'model.layers.{i}.self_attn.k_proj.scales') - v_s = get_tensor(f'model.layers.{i}.self_attn.v_proj.scales') - o_s = get_tensor(f'model.layers.{i}.self_attn.o_proj.scales') - - try: - q_b = get_tensor(f'model.layers.{i}.self_attn.q_proj.bias') - k_b = get_tensor(f'model.layers.{i}.self_attn.k_proj.bias') - v_b = get_tensor(f'model.layers.{i}.self_attn.v_proj.bias') - o_b = get_tensor(f'model.layers.{i}.self_attn.o_proj.bias') - attn_bias = True - except: # noqa: E722 - pass - - q_qw = transpose_qk_s4(q_qw) - k_qw = transpose_qk_s4(k_qw) - q_qz = transpose_qk_s4(q_qz) - k_qz = transpose_qk_s4(k_qz) - q_s = permute(q_s) - k_s = permute(k_s) - - qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2) - qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2) - qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2) - - qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size) - - qkv_qw = tp_m_s4(qkv_qw, tp) - - model_params[f'layers.{i}.attention.w_qkv.qweight'] = qkv_qw - model_params[f'layers.{i}.attention.w_qkv.scales_zeros'] = qkv_sz - - o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size) - - model_params[f'layers.{i}.attention.wo.qweight'] = o_qw - model_params[f'layers.{i}.attention.wo.scales_zeros'] = o_sz - - if attn_bias: - q_b = permute(q_b) - k_b = permute(k_b) - qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1) - model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b - model_params[f'layers.{i}.attention.wo.bias'] = o_b - - # ffn weights - w1_qw = get_tensor(f'model.layers.{i}.mlp.gate_proj.qweight') - w2_qw = get_tensor(f'model.layers.{i}.mlp.down_proj.qweight') - w3_qw = get_tensor(f'model.layers.{i}.mlp.up_proj.qweight') - - w1_qz = get_tensor(f'model.layers.{i}.mlp.gate_proj.qzeros') - w2_qz = get_tensor(f'model.layers.{i}.mlp.down_proj.qzeros') - w3_qz = get_tensor(f'model.layers.{i}.mlp.up_proj.qzeros') - - w1_s = get_tensor(f'model.layers.{i}.mlp.gate_proj.scales') - w2_s = get_tensor(f'model.layers.{i}.mlp.down_proj.scales') - w3_s = get_tensor(f'model.layers.{i}.mlp.up_proj.scales') - - w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz, - w3_s) - - w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size) - w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size) - - w13_qw = tp_m_s4(w13_qw, tp) - - model_params[f'layers.{i}.feed_forward.w13.qweight'] = w13_qw - model_params[f'layers.{i}.feed_forward.w13.scales_zeros'] = w13_sz - - model_params[f'layers.{i}.feed_forward.w2.qweight'] = w2_qw - model_params[f'layers.{i}.feed_forward.w2.scales_zeros'] = w2_sz - - # norm weights - attn_norm = get_tensor(f'model.layers.{i}.input_layernorm.weight') - ffn_norm = get_tensor( - f'model.layers.{i}.post_attention_layernorm.weight') - - 
model_params[f'layers.{i}.attention_norm.weight'] = attn_norm - model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm - - other = [('tok_embeddings.weight', 'model.embed_tokens.weight'), - ('norm.weight', 'model.norm.weight'), - ('output.weight', 'lm_head.weight')] - for ft, hf in other: - model_params[ft] = get_tensor(hf) - - return export(model_name, - num_layer, - norm_eps, - kv_head_num, - model_params, - tokenizer_path, - triton_models_path, - tp, - weight_type='int4', - group_size=group_size, - rope_theta=rope_theta) - - -def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str, - triton_models_path: str, tp: int): - """Deploy a model with huggingface transformers' format. - - Args: - model_name (str): the name of the to-be-deployed model - model_path (str): the path of the directory where the model weight - files are - tokenizer_path (str): the path of the tokenizer model path - triton_models_path (str): the path of the exported triton models - tp (int): the number of tensor parallelism - quant_path (str): path of the quantized model, which can be None - group_size (int): a parameter used in AWQ to quantize fp16 weights - to 4 bits - """ - - if osp.exists(model_path): - shutil.copy(osp.join(model_path, 'qwen.tiktoken'), - osp.join(triton_models_path, 'tokenizer')) - for _file in os.listdir(model_path): - if _file.endswith('.json') or _file.endswith('.py'): - json_path = osp.join(model_path, _file) - shutil.copy(json_path, - osp.join(triton_models_path, 'tokenizer', _file)) - with get_package_root_path() as root_path: - shutil.copy(osp.join(root_path, 'tokenizer.py'), - osp.join(triton_models_path, 'tokenizer')) - else: - print(f'tokenizer model {tokenizer_path} does not exist') - exit(-1) - - # read model arguments from params.json - try: - params_path = osp.join(model_path, 'config.json') - with open(params_path) as f: - config = json.load(f) - num_layer = config['num_hidden_layers'] - norm_eps = config['layer_norm_epsilon'] - rope_theta = float(config.get('rotary_emb_base', 10000.0)) - if 'num_key_value_heads' in config: - kv_head_num = config['num_key_value_heads'] - else: - kv_head_num = config['num_attention_heads'] - seq_length = config['seq_length'] - use_dynamic_ntk = config['use_dynamic_ntk'] - use_logn_attn = config['use_logn_attn'] - except Exception as e: - print(f'get "num_hidden_layers" and "layer_norm_epsilon" from ' - f'{params_path} failed: {e}') - return False - - # convert weights from hf to turbomind - model_params = {} - - _params = load_checkpoint(model_path) - - def get_tensor(name, trans=True): - """return a transposed tensor according its name.""" - if trans: - return _params[name].cuda().t() - else: - return _params[name].cuda() - - for i in range(num_layer): - print(i) - - # qkv weights - qkv_w = get_tensor(f'transformer.h.{i}.attn.c_attn.weight') - q_w, k_w, v_w = torch.split(qkv_w, qkv_w.size(-1) // 3, dim=-1) - q_w, k_w = permute(q_w), permute(k_w) - qkv_w = merge_qkv(q_w, k_w, v_w, tp, dim=2) - model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv_w - - # qkv bias - qkv_b = get_tensor(f'transformer.h.{i}.attn.c_attn.bias') - q_b, k_b, v_b = torch.split(qkv_b, qkv_b.size(-1) // 3) - q_b, k_b = permute(q_b), permute(k_b) - qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1) - model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b - - # o weights - o_w = get_tensor(f'transformer.h.{i}.attn.c_proj.weight') - model_params[f'layers.{i}.attention.wo.weight'] = o_w - model_params[f'layers.{i}.attention.wo.bias'] = torch.zeros_like(q_b) - - # ffn 
weights - # ours: w2(silu(w1(x)) * w3(x)) - # qwen: c_proj(w1(x) * silu(w2(x))) - w1 = get_tensor(f'transformer.h.{i}.mlp.w2.weight') - w3 = get_tensor(f'transformer.h.{i}.mlp.w1.weight') - w2 = get_tensor(f'transformer.h.{i}.mlp.c_proj.weight') - model_params[f'layers.{i}.feed_forward.w1.weight'] = w1 - model_params[f'layers.{i}.feed_forward.w2.weight'] = w2 - model_params[f'layers.{i}.feed_forward.w3.weight'] = w3 - - # norm weights - attn_norm = get_tensor(f'transformer.h.{i}.ln_1.weight') - ffn_norm = get_tensor(f'transformer.h.{i}.ln_2.weight') - - model_params[f'layers.{i}.attention_norm.weight'] = attn_norm - model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm - - other = [('tok_embeddings.weight', 'transformer.wte.weight'), - ('norm.weight', 'transformer.ln_f.weight'), - ('output.weight', 'lm_head.weight')] - for ft, hf in other: - model_params[ft] = get_tensor(hf, trans=False) - - return export(model_name, - num_layer, - norm_eps, - kv_head_num, - model_params, - model_path, - triton_models_path, - tp, - max_position_embeddings=seq_length, - use_dynamic_ntk=use_dynamic_ntk, - use_logn_attn=use_logn_attn, - rope_theta=rope_theta, - tokenizer_info=tokenizer_info_qwen) - - -def pack_model_repository(workspace_path: str): - """package the model repository. - - Args: - workspace_path: the path of workspace - """ - os.symlink(src='../../tokenizer', - dst=osp.join(workspace_path, 'triton_models', 'preprocessing', - '1', 'tokenizer')) - os.symlink(src='../../tokenizer', - dst=osp.join(workspace_path, 'triton_models', 'postprocessing', - '1', 'tokenizer')) - os.symlink(src='../../weights', - dst=osp.join(workspace_path, 'triton_models', 'interactive', - '1', 'weights')) - model_repo_dir = osp.join(workspace_path, 'model_repository') - os.makedirs(model_repo_dir, exist_ok=True) - os.symlink(src=osp.join('../triton_models/interactive'), - dst=osp.join(model_repo_dir, 'turbomind')) - os.symlink(src=osp.join('../triton_models/preprocessing'), - dst=osp.join(model_repo_dir, 'preprocessing')) - os.symlink(src=osp.join('../triton_models/postprocessing'), - dst=osp.join(model_repo_dir, 'postprocessing')) - - -def main(model_name: str, - model_path: str, - model_format: str = None, - tokenizer_path: str = None, - dst_path: str = './workspace', - tp: int = 1, - quant_path: str = None, - group_size: int = 0): - """deploy llama family models via turbomind. - - Args: - model_name (str): the name of the to-be-deployed model, such as - llama-7b, llama-13b, vicuna-7b and etc - model_path (str): the directory path of the model - model_format (str): the format of the model, fb or hf. 'fb' stands for - META's llama format, and 'hf' means huggingface format - tokenizer_path (str): the path of tokenizer model - dst_path (str): the destination path that saves outputs - tp (int): the number of GPUs used for tensor parallelism, should be 2^n - quant_path (str): path of the quantized model, which can be None - group_size (int): a parameter used in AWQ to quantize fp16 weights - to 4 bits - """ - assert model_name in MODELS.module_dict.keys(), \ - f"'{model_name}' is not supported. " \ - f'The supported models are: {MODELS.module_dict.keys()}' - - assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n' - - if model_format is None: - model_format = 'qwen' if model_name == 'qwen-7b' else 'hf' - - if model_format not in supported_formats: - print(f'the model format "{model_format}" is not supported. 
' - f'The supported format are: {supported_formats}') - exit(-1) - - if model_format == 'llama' and tokenizer_path is None: - print('The model is llama. Its tokenizer model path should be ' - 'specified') - exit(-1) - - if not create_workspace(dst_path): - exit(-1) - - triton_models_path = copy_triton_model_templates(dst_path) - if triton_models_path is None: - exit(-1) - - if model_format == 'llama': - res = deploy_llama(model_name, model_path, tokenizer_path, - triton_models_path, tp) - elif model_format == 'hf': - res = deploy_hf(model_name, model_path, tokenizer_path, - triton_models_path, tp) - elif model_format == 'awq': - res = deploy_awq(model_name, model_path, tokenizer_path, - triton_models_path, tp, quant_path, group_size) - elif model_format == 'qwen': - res = deploy_qwen(model_name, model_path, tokenizer_path, - triton_models_path, tp) - - # update `tensor_para_size` in `triton_models/interactive/config.pbtxt` - with open(osp.join(triton_models_path, 'interactive/config.pbtxt'), - 'a') as f: - param = \ - 'parameters {\n key: "tensor_para_size"\n value: {\n ' \ - 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \ - 'parameters {\n key: "model_name"\n value: {\n ' \ - 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n' - f.write(param) - if not res: - print(f'deploy model "{model_name}" via turbomind failed') - destroy_workspace(dst_path) - exit(-1) - - # pack model repository for triton inference server - pack_model_repository(dst_path) - - # update the value of $TP in `service_docker_up.sh` - file_path = osp.join(dst_path, 'service_docker_up.sh') - with open(file_path, 'r') as f: - content = f.read() - content = re.sub('TP=1', f'TP={tp}', content) - with open(file_path, 'w') as f: - f.write(content) - - -if __name__ == '__main__': - fire.Fire(main) diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py index 296d453ed4..231601fde0 100644 --- a/lmdeploy/tokenizer.py +++ b/lmdeploy/tokenizer.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import json +import os import os.path as osp from typing import Optional, Sequence, Union @@ -16,7 +17,7 @@ class SentencePieceTokenizer: def __init__(self, model_file: str): from sentencepiece import SentencePieceProcessor self.model = SentencePieceProcessor(model_file=model_file) - self._no_prefix_space_tokens = None + self._prefix_space_tokens = None @property def vocab_size(self): @@ -34,19 +35,20 @@ def eos_token_id(self): return self.model.eos_id() @property - def no_prefix_space_tokens(self): + def prefix_space_tokens(self): """tokens without prefix space.""" - if self._no_prefix_space_tokens is None: + if self._prefix_space_tokens is None: vocab = self.model.IdToPiece(list(range(self.vocab_size))) - self._no_prefix_space_tokens = { + self._prefix_space_tokens = { i - for i, tok in enumerate(vocab) if not tok.startswith('▁') + for i, tok in enumerate(vocab) if tok.startswith('▁') } - return self._no_prefix_space_tokens + return self._prefix_space_tokens def _maybe_add_prefix_space(self, tokens, decoded): """maybe add prefix space for incremental decoding.""" - if len(tokens) and tokens[0] not in self.no_prefix_space_tokens: + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: return ' ' + decoded else: return decoded @@ -111,8 +113,7 @@ class HuggingFaceTokenizer: """ def __init__(self, model_dir: str, trust_remote_code=True): - from transformers import (AutoTokenizer, CodeLlamaTokenizerFast, - LlamaTokenizerFast) + from transformers import AutoTokenizer model_file = osp.join(model_dir, 'tokenizer.model') backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json') model_file_exists = osp.exists(model_file) @@ -121,20 +122,22 @@ def __init__(self, model_dir: str, trust_remote_code=True): 'It may take long time to initialize the tokenizer.') self.model = AutoTokenizer.from_pretrained( model_dir, trust_remote_code=trust_remote_code) - self.need_padding = isinstance(self.model, LlamaTokenizerFast) \ - or isinstance(self.model, CodeLlamaTokenizerFast) - self._no_prefix_space_tokens = None + self._prefix_space_tokens = None # save tokenizer.json to reuse if not osp.exists(backend_tokenizer_file) and model_file_exists: if hasattr(self.model, 'backend_tokenizer'): - self.model.backend_tokenizer.save(backend_tokenizer_file) + if os.access(model_dir, os.W_OK): + self.model.backend_tokenizer.save(backend_tokenizer_file) if self.model.eos_token_id is None: generation_config_file = osp.join(model_dir, 'generation_config.json') - with open(generation_config_file, 'r') as f: - cfg = json.load(f) - self.model.eos_token_id = cfg['eos_token_id'] + if osp.exists(generation_config_file): + with open(generation_config_file, 'r') as f: + cfg = json.load(f) + self.model.eos_token_id = cfg['eos_token_id'] + elif hasattr(self.model, 'eod_id'): # Qwen remote + self.model.eos_token_id = self.model.eod_id @property def vocab_size(self): @@ -152,21 +155,22 @@ def eos_token_id(self): return self.model.eos_token_id @property - def no_prefix_space_tokens(self): + def prefix_space_tokens(self): """tokens without prefix space.""" - if self._no_prefix_space_tokens is None: + if self._prefix_space_tokens is None: vocab = self.model.convert_ids_to_tokens( list(range(self.vocab_size))) - self._no_prefix_space_tokens = { + self._prefix_space_tokens = { i - for i, tok in enumerate(vocab) if not tok.startswith('▁') + for i, tok in enumerate(vocab) + if tok.startswith('▁' if isinstance(tok, str) else b' ') } - return self._no_prefix_space_tokens + return self._prefix_space_tokens 
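# --- illustrative sketch ---------------------------------------------------
# The prefix-space rule introduced above, reduced to a plain function: during
# incremental decoding, a chunk whose first token carries the "▁" word-start
# marker loses its leading space in decode(), so the space is re-added unless
# the decoded text already starts with one.  The vocab/ids here are made up.
def maybe_add_prefix_space(prefix_space_tokens, tokens, decoded):
    if len(tokens) and not decoded.startswith(' ') \
            and tokens[0] in prefix_space_tokens:
        return ' ' + decoded
    return decoded

prefix_space_tokens = {7}                              # pretend id 7 is '▁world'
print(maybe_add_prefix_space(prefix_space_tokens, [7], 'world'))    # ' world'
print(maybe_add_prefix_space(prefix_space_tokens, [7], ' world'))   # ' world'
# ---------------------------------------------------------------------------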
def _maybe_add_prefix_space(self, tokens, decoded): """maybe add prefix space for incremental decoding.""" - if self.need_padding and len( - tokens) and tokens[0] not in self.no_prefix_space_tokens: + if len(tokens) and not decoded.startswith(' ') and\ + tokens[0] in self.prefix_space_tokens: return ' ' + decoded else: return decoded diff --git a/lmdeploy/turbomind/chat.py b/lmdeploy/turbomind/chat.py index de31a5daa7..8091dd29b4 100644 --- a/lmdeploy/turbomind/chat.py +++ b/lmdeploy/turbomind/chat.py @@ -4,12 +4,6 @@ import os.path as osp import random -import fire - -from lmdeploy import turbomind as tm -from lmdeploy.model import MODELS -from lmdeploy.tokenizer import Tokenizer - os.environ['TM_LOG_LEVEL'] = 'ERROR' @@ -73,9 +67,9 @@ def get_gen_param(cap, def main(model_path, session_id: int = 1, cap: str = 'chat', - sys_instruct: str = None, - tp=1, - stream_output=True, + tp: int = 1, + stream_output: bool = True, + request_output_len: int = 512, **kwargs): """An example to perform model inference through the command line interface. @@ -85,24 +79,27 @@ def main(model_path, session_id (int): the identical id of a session cap (str): the capability of a model. For example, codellama has the ability among ['completion', 'infilling', 'chat', 'python'] - sys_instruct (str): the content of 'system' role, which is used by - conversational model tp (int): GPU number used in tensor parallelism stream_output (bool): indicator for streaming output or not **kwarg (dict): other arguments for initializing model's chat template """ + from lmdeploy import turbomind as tm + from lmdeploy.tokenizer import Tokenizer + tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer') tokenizer = Tokenizer(tokenizer_model_path) - tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp) + tm_model = tm.TurboMind(model_path, + eos_id=tokenizer.eos_token_id, + tp=tp, + capability=cap, + **kwargs) generator = tm_model.create_instance() nth_round = 1 step = 0 seed = random.getrandbits(64) model_name = tm_model.model_name - model = MODELS.get(model_name)(capability=cap, **kwargs) \ - if sys_instruct is None else MODELS.get(model_name)( - capability=cap, system=sys_instruct, **kwargs) + model = tm_model.model print(f'session {session_id}') while True: @@ -112,12 +109,13 @@ def main(model_path, elif prompt == 'end': prompt = model.get_prompt('', nth_round == 1) input_ids = tokenizer.encode(prompt) - for outputs in generator.stream_infer(session_id=session_id, - input_ids=[input_ids], - request_output_len=512, - sequence_start=False, - sequence_end=True, - stream_output=stream_output): + for outputs in generator.stream_infer( + session_id=session_id, + input_ids=[input_ids], + request_output_len=request_output_len, + sequence_start=False, + sequence_end=True, + stream_output=stream_output): pass nth_round = 1 step = 0 @@ -125,13 +123,14 @@ def main(model_path, else: prompt = model.get_prompt(prompt, nth_round == 1) input_ids = tokenizer.encode(prompt) - if step + len(input_ids) >= tm_model.session_len: + if step + len( + input_ids) + request_output_len >= tm_model.session_len: print('WARNING: exceed session max length.' 
' Please end the session.') continue gen_param = get_gen_param(cap, model.sampling_param, nth_round, - step, **kwargs) + step, request_output_len, **kwargs) print(f'{prompt} ', end='', flush=True) response_size = 0 @@ -145,6 +144,11 @@ def main(model_path, res, tokens = outputs[0] # decode res response = tokenizer.decode(res.tolist(), offset=response_size) + # utf-8 char at the end means it's a potential unfinished + # byte sequence, continue to concate it with the next + # sequence and decode them together + if response.endswith('�'): + continue response = valid_str(response) print(f'{response}', end='', flush=True) response_size = tokens @@ -157,4 +161,6 @@ def main(model_path, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/turbomind/decode.py b/lmdeploy/turbomind/decode.py index daef35298c..5ba4675c59 100644 --- a/lmdeploy/turbomind/decode.py +++ b/lmdeploy/turbomind/decode.py @@ -2,7 +2,6 @@ import os import os.path as osp -import fire import torch from lmdeploy import turbomind as tm @@ -37,4 +36,6 @@ def main(model_path, inputs): if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/turbomind/deploy/__init__.py b/lmdeploy/turbomind/deploy/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/lmdeploy/turbomind/deploy/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py new file mode 100644 index 0000000000..4876002020 --- /dev/null +++ b/lmdeploy/turbomind/deploy/converter.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import re +import shutil +from pathlib import Path + +import fire + +from lmdeploy.model import MODELS + +from .source_model.base import INPUT_MODELS +from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig + +supported_formats = ['llama', 'hf', 'awq', None] +special_input_model_map = { + 'qwen': 'qwen', + 'baichuan': 'baichuan', + 'baichuan2': 'baichuan2' +} + + +def get_package_root_path(): + """Get lmdeploy root path.""" + import lmdeploy + return Path(lmdeploy.__file__).parent + + +def get_tokenizer_path(model_path: str, tokenizer_path: str): + """Get tokenizer path if not given.""" + if tokenizer_path is not None: + assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exists.' + return tokenizer_path + candidate = ['tokenizer.model', 'qwen.tiktoken'] + for name in candidate: + tmp_path = osp.join(model_path, name) + if osp.exists(tmp_path): + tokenizer_path = tmp_path + break + assert tokenizer_path, 'please supply tokenizer path by --tokenizer-path' + return tokenizer_path + + +def get_model_format(model_name: str, model_format: str): + """Get model format if not given or equal awq.""" + # get model name prefix + if model_name.find('-') != -1: + model_name = model_name[:model_name.find('-')] + # rules: + # 1) llama -> match special -> hf (if not matched) + # 2) append awq (if model_format is awq) + inferred_model_format = model_format + if model_format in [None, 'hf']: + inferred_model_format = special_input_model_map.get(model_name, 'hf') + elif model_format == 'awq': + inferred_model_format = special_input_model_map.get(model_name, + 'hf') + '-awq' + return inferred_model_format + + +def create_workspace(_path: str): + """Create a workspace. 
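# --- illustrative sketch ---------------------------------------------------
# Why the streaming loop in chat.py above skips chunks that end in '�': when a
# multi-byte UTF-8 character is decoded before all of its bytes (or byte-level
# tokens) have arrived, the tail decodes to the replacement character, so the
# chunk is held back and decoded again together with the next one.
data = '你好'.encode('utf-8')                      # 6 bytes, 3 per character
partial = data[:4].decode('utf-8', errors='replace')
print(partial, partial.endswith('�'))              # '你�' True  -> keep buffering
print(data.decode('utf-8', errors='replace'))      # '你好'       -> safe to emit
# ---------------------------------------------------------------------------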
+ + Args: + _path (str): the path of the workspace + """ + if osp.exists(_path): + print(f'remove workspace in directory {_path}') + shutil.rmtree(_path) + print(f'create workspace in directory {_path}') + os.makedirs(_path) + + +def copy_triton_model_templates(_path: str): + """copy triton model templates to the specified path. + + Args: + _path (str): the target path + Returns: + str: the path of the triton models + """ + + root = get_package_root_path() + dir_path = osp.join(root, 'serve', 'turbomind') + triton_models_path = osp.join(dir_path, 'triton_models') + dst_path = osp.join(_path, 'triton_models') + print(f'copy triton model templates from "{triton_models_path}" to ' + f'"{dst_path}"') + shutil.copytree(triton_models_path, dst_path, symlinks=True) + service_docker_up_file = osp.join(dir_path, 'service_docker_up.sh') + print(f'copy service_docker_up.sh from "{service_docker_up_file}" to ' + f'"{_path}"') + shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path) + return dst_path + + +def copy_tokenizer(model_path: str, tokenizer_path: str, + triton_models_path: str): + """Copy tokenizer.""" + shutil.copy( + tokenizer_path, + osp.join(triton_models_path, + osp.join('tokenizer', osp.basename(tokenizer_path)))) + for _file in os.listdir(model_path): + if _file.endswith('.json') or _file.endswith('.py'): + json_path = osp.join(model_path, _file) + shutil.copy(json_path, + osp.join(triton_models_path, 'tokenizer', _file)) + with get_package_root_path() as root_path: + shutil.copy(osp.join(root_path, 'tokenizer.py'), + osp.join(triton_models_path, 'tokenizer')) + + +def pack_model_repository(workspace_path: str): + """package the model repository. + + Args: + workspace_path: the path of workspace + """ + os.symlink(src=osp.join('..', '..', 'tokenizer'), + dst=osp.join(workspace_path, 'triton_models', 'preprocessing', + '1', 'tokenizer')) + os.symlink(src=osp.join('..', '..', 'tokenizer'), + dst=osp.join(workspace_path, 'triton_models', 'postprocessing', + '1', 'tokenizer')) + os.symlink(src=osp.join('..', '..', 'weights'), + dst=osp.join(workspace_path, 'triton_models', 'interactive', + '1', 'weights')) + model_repo_dir = osp.join(workspace_path, 'model_repository') + os.makedirs(model_repo_dir, exist_ok=True) + os.symlink(src=osp.join('..', 'triton_models', 'interactive'), + dst=osp.join(model_repo_dir, 'turbomind')) + os.symlink(src=osp.join('..', 'triton_models', 'preprocessing'), + dst=osp.join(model_repo_dir, 'preprocessing')) + os.symlink(src=osp.join('..', 'triton_models', 'postprocessing'), + dst=osp.join(model_repo_dir, 'postprocessing')) + + +def main(model_name: str, + model_path: str, + model_format: str = None, + tokenizer_path: str = None, + dst_path: str = 'workspace', + tp: int = 1, + quant_path: str = None, + group_size: int = 0): + """deploy llama family models via turbomind. + + Args: + model_name (str): the name of the to-be-deployed model, such as + llama-7b, llama-13b, vicuna-7b and etc + model_path (str): the directory path of the model + model_format (str): the format of the model, should choose from + ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama + format, 'hf' means huggingface llama format, and 'awq' means + llama(hf) model quantized by lmdeploy/lite/quantization/awq.py. 
+ the default value is None, which means the model_format will be + inferred based on model_name + tokenizer_path (str): the path of tokenizer model + dst_path (str): the destination path that saves outputs + tp (int): the number of GPUs used for tensor parallelism, should be 2^n + quant_path (str): Path of the quantized model, which can be None. + group_size (int): a parameter used in AWQ to quantize fp16 weights + to 4 bits + """ + + assert model_name in MODELS.module_dict.keys(), \ + f"'{model_name}' is not supported. " \ + f'The supported models are: {MODELS.module_dict.keys()}' + + assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n' + + output_format = 'fp16' + + # get input model format + assert model_format in supported_formats, 'the model format ' \ + f'should be in {supported_formats}' + + inferred_model_format = get_model_format(model_name, model_format) + if inferred_model_format not in INPUT_MODELS.module_dict.keys(): + supported_keys = list(INPUT_MODELS.module_dict.keys()) + print(f'with model name {model_name} and model formst {model_format}, ' + f'the inferred model format is {inferred_model_format}, ' + f'which is not in supported list {supported_keys}') + exit(-1) + + # get tokenizer path + tokenizer_path = get_tokenizer_path(model_path, tokenizer_path) + + # create workspace + create_workspace(dst_path) + + triton_models_path = copy_triton_model_templates(dst_path) + + copy_tokenizer(model_path, tokenizer_path, triton_models_path) + + # turbomind config + cfg = TurbomindModelConfig.from_dict({}, allow_none=True) + cfg.model_name = model_name + cfg.tensor_para_size = tp + cfg.rotary_embedding = cfg.size_per_head + cfg.group_size = group_size + if inferred_model_format.find('awq') != -1: + cfg.weight_type = 'int4' + output_format = 'w4' + assert group_size > 0, 'group_size should > 0' + + # convert + print('model_name ', model_name) + print('model_format ', model_format) + print('inferred_model_format ', inferred_model_format) + print('model_path ', model_path) + print('tokenizer_path ', tokenizer_path) + print('output_format ', output_format) + weight_path = osp.join(triton_models_path, 'weights') + input_model = INPUT_MODELS.get(inferred_model_format)( + model_path=model_path, + tokenizer_path=tokenizer_path, + ckpt_path=quant_path) + output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model, + cfg=cfg, + to_file=True, + out_dir=weight_path) + output_model.export() + + # update `tensor_para_size` in `triton_models/interactive/config.pbtxt` + with open(osp.join(triton_models_path, 'interactive', 'config.pbtxt'), + 'a') as f: + param = \ + 'parameters {\n key: "tensor_para_size"\n value: {\n ' \ + 'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \ + 'parameters {\n key: "model_name"\n value: {\n ' \ + 'string_value: ' + f'"{model_name}"\n' + ' }\n}\n' + f.write(param) + + # pack model repository for triton inference server + pack_model_repository(dst_path) + + # update the value of $TP in `service_docker_up.sh` + file_path = osp.join(dst_path, 'service_docker_up.sh') + with open(file_path, 'r') as f: + content = f.read() + content = re.sub('TP=1', f'TP={tp}', content) + with open(file_path, 'w') as f: + f.write(content) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/lmdeploy/turbomind/deploy/source_model/__init__.py b/lmdeploy/turbomind/deploy/source_model/__init__.py new file mode 100644 index 0000000000..7c6627c770 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401 +from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401 +from .llama import LlamaModel # noqa: F401 +from .llama_awq import LlamaAwqModel # noqa: F401 +from .meta_llama import MetaLlamaModel # noqa: F401 +from .qwen import QwenModel # noqa: F401 +from .qwen_awq import QwenAwqModel # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py new file mode 100644 index 0000000000..46ccb6309d --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/baichuan.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class BaichuanReader(LlamaReader): + """BaichuanReader.""" + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): + """Get q, k, v, o kind for layer i.""" + result = [] + pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}' + qkv = self.params[pack_key] + result.extend(torch.split(qkv, qkv.shape[size_dim] // 3, dim=dim)) + o = self.params[f'model.layers.{i}.self_attn.o_proj.{kind}'] + result.append(o) + return (*result, ) + + def attn(self, i: int): + """Get q, k, v, o weight for layer i.""" + return self._attn(i, 'weight', 0, 0) + + def attn_bias(self, i: int): + """Get q, k, v, o bias for layer i.""" + return (None, ) * 4 + + +class Baichuan2Reader(BaichuanReader): + """Baichuan2Reader.""" + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def output_weight(self): + """Get output.""" + # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 + tensor = self.params.get('lm_head.weight', None) + if tensor is not None: + tensor = tensor.cuda() + tensor = torch.nn.functional.normalize(tensor) + return tensor + + +@INPUT_MODELS.register_module(name='baichuan') +class BaichuanModel(LlamaModel): + """Llama model in baichuan format.""" + + Reader = BaichuanReader + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): + super().__init__(model_path, tokenizer_path, **kwargs) + + +@INPUT_MODELS.register_module(name='baichuan2') +class Baichuan2Model(LlamaModel): + """Llama model in baichuan format.""" + + Reader = Baichuan2Reader + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): + super().__init__(model_path, tokenizer_path, **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py new file mode 100644 index 0000000000..d5d60286a8 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/baichuan_awq.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from .baichuan import Baichuan2Model, BaichuanModel, BaichuanReader +from .base import INPUT_MODELS +from .llama_awq import ensure_fp16orint32 + + +class BaichuanAwqReader(BaichuanReader): + """BaichuanAwqReader.""" + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def attn(self, i: int): + """Get q, k, v, o qweight for layer i.""" + return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1)) + + def attn_zero(self, i: int): + """Get q, k, v, o qzeros for layer i.""" + return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1)) + + def attn_scale(self, i: int): + """Get q, k, v, o scales for layer i.""" + return ensure_fp16orint32(self._attn(i, 'scales', -1, -1)) + + def ffn(self, i: int): + """Get ffn qweight for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'qweight')) + + def ffn_zero(self, i: int): + """Get ffn qzeros for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'qzeros')) + + def ffn_scale(self, i: int): + """Get ffn scales for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'scales')) + + +class Baichuan2AwqReader(BaichuanAwqReader): + """Baichuan2AwqReader.""" + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def output_weight(self): + """Get output.""" + # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507 + tensor = self.params.get('lm_head.weight', None) + if tensor is not None: + tensor = tensor.cuda() + tensor = torch.nn.functional.normalize(tensor) + return tensor + + +@INPUT_MODELS.register_module(name='baichuan-awq') +class BaichuanAwqModel(BaichuanModel): + """Baichuan awq model in hf format.""" + + Reader = BaichuanAwqReader + + def __init__(self, + model_path: str, + tokenizer_path: str, + ckpt_path: str = None, + **kwargs): + super().__init__(model_path, + tokenizer_path, + ckpt_path=ckpt_path, + **kwargs) + + +@INPUT_MODELS.register_module(name='baichuan2-awq') +class Baichuan2AwqModel(Baichuan2Model): + """Baichuan2 awq model in hf format.""" + + Reader = Baichuan2AwqReader + + def __init__(self, + model_path: str, + tokenizer_path: str, + ckpt_path: str = None, + **kwargs): + super().__init__(model_path, + tokenizer_path, + ckpt_path=ckpt_path, + **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py new file mode 100644 index 0000000000..89f18033e9 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/base.py @@ -0,0 +1,174 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import re +from abc import ABC, abstractmethod +from typing import Dict, Iterator, Tuple, Union + +import torch +from mmengine import Registry + +INPUT_MODELS = Registry( + 'source model', locations=['lmdeploy.turbomind.deploy.source_model.base']) + + +class BaseReader(ABC): + """Base checkpoint manager.""" + + def __init__(self): + pass + + @property + @abstractmethod + def start_layer_id(self) -> int: + """Get the start transformer layer number.""" + pass + + @property + @abstractmethod + def end_layer_id(self) -> int: + """Get the end transformer layer number.""" + pass + + @abstractmethod + def init_layer_id(self) -> None: + """Get start and end transformer layer number.""" + self._start_layer_id = -1 + self._end_layer_id = -1 + layer_count = {} + for key in self.params: + layer_id = re.findall(self.attn_layer_patten, key) + if len(layer_id) == 0: + continue + layer_id = int(layer_id[0]) + if layer_id not in layer_count: + layer_count[layer_id] = 0 + layer_count[layer_id] += 1 + if len(layer_count) == 0: + return + if not (len(layer_count) > 1 or self.last_bin): + return + max_count = max([layer_count[layer_id] for layer_id in layer_count]) + valid_layer_id = [ + layer_id for layer_id in layer_count + if layer_count[layer_id] == max_count + ] + self._start_layer_id = min(valid_layer_id) + self._end_layer_id = max(valid_layer_id) + 1 + + @abstractmethod + def clean_up(self, last: bool) -> None: + """Clean up unused params.""" + if last: + self.params.clear() + else: + to_remove = [] + for key in self.params: + layer_id = re.findall(self.attn_layer_patten, key) + if len(layer_id) == 0: + to_remove.append(key) + else: + layer_id = int(layer_id[0]) + if layer_id < self.end_layer_id: + to_remove.append(key) + for key in to_remove: + self.params.pop(key, None) + torch.cuda.empty_cache() + + @abstractmethod + def tok_embeddings(self) -> Union[torch.Tensor, None]: + """Get embeddings.""" + pass + + @abstractmethod + def norm_weight(self) -> Union[torch.Tensor, None]: + """Get norm.""" + pass + + @abstractmethod + def output_weight(self) -> Union[torch.Tensor, None]: + """Get output.""" + pass + + @abstractmethod + def attn(self, i: int) -> Tuple[torch.Tensor]: + """Get q, k, v, o weight for layer i.""" + pass + + @abstractmethod + def attn_bias(self, i: int) -> Tuple[torch.Tensor, None]: + """Get q, k, v, o bias for layer i.""" + pass + + @abstractmethod + def attn_zero(self, i: int) -> Tuple[torch.Tensor, None]: + """Get q, k, v, o zero point for layer i.""" + pass + + @abstractmethod + def attn_scale(self, i: int) -> Tuple[torch.Tensor, None]: + """Get q, k, v, o scale for layer i.""" + pass + + @abstractmethod + def attn_norm(self, i: int) -> torch.Tensor: + """Get attn norm for layer i.""" + pass + + @abstractmethod + def ffn(self, i: int) -> Tuple[torch.Tensor]: + """Get ffn weight for layer i.""" + pass + + @abstractmethod + def ffn_zero(self, i: int) -> Tuple[torch.Tensor, None]: + """Get ffn zero point for layer i.""" + pass + + @abstractmethod + def ffn_scale(self, i: int) -> Tuple[torch.Tensor, None]: + """Get ffn scale for layer i.""" + pass + + @abstractmethod + def ffn_norm(self, i: int) -> torch.Tensor: + """Get ffn norm for layer i.""" + pass + + +class BaseInputModel(ABC): + """Base class for input model.""" + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + """Constructor for BaseInputModel. + + Args: + model_path (str): the path of the model. + tokenizer_path (str): the path of the tokenizer model. 
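# --- illustrative sketch ---------------------------------------------------
# The registry pattern used by INPUT_MODELS above (mmengine.Registry), shown
# with a throw-away registry: classes register themselves under a format name
# and the converter later looks them up by the inferred format string.  The
# names below are placeholders, not the real lmdeploy registries.
from mmengine import Registry

DEMO_MODELS = Registry('demo source model')

@DEMO_MODELS.register_module(name='hf')
class DemoHfModel:
    def __init__(self, model_path: str):
        self.model_path = model_path

print(DEMO_MODELS.get('hf')('/path/to/model').model_path)
print(list(DEMO_MODELS.module_dict.keys()))        # ['hf']
# ---------------------------------------------------------------------------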
+ """ + self.model_path = model_path + self.tokenizer_path = tokenizer_path + + @property + @abstractmethod + def nmgrs(self) -> int: + """Get number of checkpoint.""" + pass + + @abstractmethod + def get_mgrs(self) -> Iterator[BaseReader]: + """Conctruct all BaseReader.""" + pass + + @abstractmethod + def tokenizer_info(self): + """Read tokenizer info.""" + pass + + @abstractmethod + def model_info(self) -> Dict: + """Read model info.""" + pass + + def bins(self) -> Iterator[BaseReader]: + """Get Reader.""" + for mgr in self.get_mgrs(): + yield mgr diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py new file mode 100644 index 0000000000..f800260467 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/llama.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp + +import torch +from safetensors.torch import load_file + +from lmdeploy.tokenizer import Tokenizer + +from .base import INPUT_MODELS, BaseInputModel, BaseReader + + +class LlamaReader(BaseReader): + """LlamaReader.""" + + attn_layer_patten = r'model.layers.([0-9]+).' + tok_embeddings_key = 'model.embed_tokens.weight' + norm_weight_key = 'model.norm.weight' + output_weight_key = 'lm_head.weight' + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__() + self.params = unused_params + self.params.update(new_params) + self.last_bin = last_bin + self.init_layer_id() + + def init_layer_id(self): + """Get start/end transformer layer id.""" + super().init_layer_id() + + def clean_up(self, last: bool) -> None: + """Clean up unused params.""" + super().clean_up(last) + + @property + def start_layer_id(self): + """Get start transformer layer id.""" + return self._start_layer_id + + @property + def end_layer_id(self): + """Get end transformer layer id.""" + return self._end_layer_id + + def tok_embeddings(self): + """Get embeddings.""" + return self.params.get(self.tok_embeddings_key, None) + + def norm_weight(self): + """Get norm.""" + return self.params.get(self.norm_weight_key, None) + + def output_weight(self): + """Get output.""" + return self.params.get(self.output_weight_key, None) + + def _attn(self, i: int, kind: str, allow_none=False): + """Get q, k, v, o kind for layer i.""" + result = [] + for key in ['q', 'k', 'v', 'o']: + tensor = self.params.get( + f'model.layers.{i}.self_attn.{key}_proj.{kind}') + if not allow_none: + assert tensor is not None + result.append(tensor) + return (*result, ) + + def attn(self, i: int): + """Get q, k, v, o weight for layer i.""" + return self._attn(i, 'weight') + + def attn_bias(self, i: int): + """Get q, k, v, o bias for layer i.""" + return self._attn(i, 'bias', allow_none=True) + + def attn_zero(self, i: int): + """Get q, k, v, o zero point for layer i.""" + return (None, ) * 4 + + def attn_scale(self, i: int): + """Get q, k, v, o scale for layer i.""" + return (None, ) * 4 + + def attn_norm(self, i: int): + """Get attn norm for layer i.""" + return self.params[f'model.layers.{i}.input_layernorm.weight'] + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + result = [] + for key in ['gate', 'down', 'up']: + tensor = self.params[f'model.layers.{i}.mlp.{key}_proj.{kind}'] + result.append(tensor) + return (*result, ) + + def ffn(self, i: int): + """Get ffn weight for layer i.""" + return self._ffn(i, 'weight') + + def ffn_zero(self, i: int): + """Get ffn zero point for layer i.""" + return (None, ) * 3 + + def 
ffn_scale(self, i: int): + """Get ffn scale for layer i.""" + return (None, ) * 3 + + def ffn_norm(self, i: int): + """Get ffn norm for layer i.""" + return self.params[f'model.layers.{i}.post_attention_layernorm.weight'] + + +@INPUT_MODELS.register_module(name='hf') +class LlamaModel(BaseInputModel): + """Llama model in hf format.""" + + Reader = LlamaReader + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict): + super().__init__(model_path, tokenizer_path) + ckpt_path = kwargs.get('ckpt_path') + if ckpt_path is None: + ckpt_path = model_path + self.ckpt_path = ckpt_path + self.ckpt_files = self.get_ckpt() + + def get_ckpt(self): + """Get weight files.""" + suffixes = ['.safetensors', '.bin'] + files = [] + for suffix in suffixes: + files = [ + file for file in os.listdir(self.ckpt_path) + if file.endswith(suffix) + ] + if len(files) > 0: + break + files = sorted(files) + return files + + @property + def nmgrs(self): + """Get number of checkpoint.""" + return len(self.ckpt_files) + + def get_mgrs(self): + """Conctruct all Reader.""" + assert self.nmgrs > 0, \ + f'could not find checkpoints in {self.ckpt_path}' + unused_params = {} + try: + for i, ckpt in enumerate(self.ckpt_files): + is_last_bin = i == len(self.ckpt_files) - 1 + if ckpt.endswith('.bin'): + new_params = torch.load(osp.join(self.ckpt_path, ckpt), + map_location='cpu') + else: + new_params = load_file(osp.join(self.ckpt_path, ckpt)) + ret = self.Reader(new_params, unused_params, + i == self.nmgrs - 1) + yield ret + ret.clean_up(is_last_bin) + except GeneratorExit: + ret.clean_up(True) + + def tokenizer_info(self): + """Read tokenizer info.""" + assert osp.isdir(self.model_path), self.model_path + tk_model = Tokenizer(self.model_path) + n_words = tk_model.vocab_size + bos_id = tk_model.bos_token_id + eos_id = tk_model.eos_token_id + return n_words, bos_id, eos_id + + def model_info(self): + """Read model info.""" + params_path = osp.join(self.model_path, 'config.json') + with open(params_path) as f: + model_arg = json.load(f) + num_layer = model_arg['num_hidden_layers'] + norm_eps = model_arg['rms_norm_eps'] + if 'num_key_value_heads' in model_arg: + kv_head_num = model_arg['num_key_value_heads'] + else: + kv_head_num = model_arg['num_attention_heads'] + rope_theta = float(model_arg.get('rope_theta', 10000.0)) + max_position_embeddings = int( + model_arg.get('max_position_embeddings', 0)) + repo_scaling = bool(model_arg.get('rope_scaling', False)) + + return dict(num_layer=num_layer, + norm_eps=norm_eps, + kv_head_num=kv_head_num, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + use_dynamic_ntk=int(repo_scaling)) diff --git a/lmdeploy/turbomind/deploy/source_model/llama_awq.py b/lmdeploy/turbomind/deploy/source_model/llama_awq.py new file mode 100644 index 0000000000..9d2ae8ac50 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/llama_awq.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
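# --- illustrative sketch ---------------------------------------------------
# The shard-discovery rule from LlamaModel.get_ckpt() above, as a standalone
# helper: prefer *.safetensors files and fall back to *.bin only when none are
# present, sorted so the shards are visited in a deterministic order.
import os

def find_ckpt_files(ckpt_path: str):
    for suffix in ('.safetensors', '.bin'):
        files = sorted(f for f in os.listdir(ckpt_path) if f.endswith(suffix))
        if files:
            return files
    return []
# e.g. find_ckpt_files('/path/to/hf/model')
# -> ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']
# ---------------------------------------------------------------------------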
+import torch + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +def ensure_fp16orint32(tensors: torch.Tensor): + """Ensure tensors in fp16/int32 format.""" + result = [] + for tensor in tensors: + if tensor is not None: + if tensor.dtype in [torch.float16, torch.float32, torch.bfloat16]: + result.append(tensor.half()) + else: + assert tensor.dtype == torch.int32 + result.append(tensor) + else: + result.append(None) + return (*result, ) + + +class LlamaAwqReader(LlamaReader): + """LlamaAwqReader.""" + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def attn(self, i: int): + """Get q, k, v, o qweight for layer i.""" + return ensure_fp16orint32(self._attn(i, 'qweight')) + + def attn_zero(self, i: int): + """Get q, k, v, o qzeros for layer i.""" + return ensure_fp16orint32(self._attn(i, 'qzeros')) + + def attn_scale(self, i: int): + """Get q, k, v, o scales for layer i.""" + return ensure_fp16orint32(self._attn(i, 'scales')) + + def ffn(self, i: int): + """Get ffn qweight for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'qweight')) + + def ffn_zero(self, i: int): + """Get ffn qzeros for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'qzeros')) + + def ffn_scale(self, i: int): + """Get ffn scales for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'scales')) + + +@INPUT_MODELS.register_module(name='hf-awq') +class LlamaAwqModel(LlamaModel): + """Llama Awq model in hf format.""" + + Reader = LlamaAwqReader + + def __init__(self, + model_path: str, + tokenizer_path: str, + ckpt_path: str = None, + **kwargs): + super().__init__(model_path, + tokenizer_path, + ckpt_path=ckpt_path, + **kwargs) diff --git a/lmdeploy/turbomind/deploy/source_model/meta_llama.py b/lmdeploy/turbomind/deploy/source_model/meta_llama.py new file mode 100644 index 0000000000..bc26361c73 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/meta_llama.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
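The `ensure_fp16orint32` helper in `llama_awq.py` above is the only dtype gate the AWQ readers apply: floating-point tensors (scales, biases) are downcast to fp16, packed `qweight`/`qzeros` tensors must already be int32, and missing entries pass through as `None`. A quick self-contained check of that behaviour, assuming this patch is importable:

```python
import torch

from lmdeploy.turbomind.deploy.source_model.llama_awq import ensure_fp16orint32

scales = torch.rand(4, 4, dtype=torch.float32)        # e.g. AWQ scales stored in fp32
qweight = torch.zeros(4, 4, dtype=torch.int32)        # packed 4-bit weights

out_scales, out_qweight, missing = ensure_fp16orint32([scales, qweight, None])
assert out_scales.dtype == torch.float16    # floats are downcast to fp16
assert out_qweight.dtype == torch.int32     # packed int32 tensors pass through untouched
assert missing is None                      # absent tensors (e.g. no bias) stay None
```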
+import json +import os.path as osp +from pathlib import Path + +import torch +from sentencepiece import SentencePieceProcessor + +from .base import INPUT_MODELS, BaseInputModel, BaseReader + + +def reverse_permute(x: torch.Tensor, size_per_head: int = 128): + """reverse permute to hf format.""" + if x.shape[-1] > 1: + dim = x.shape[-1] + n_heads = dim // size_per_head + return x.view(-1, n_heads, dim // n_heads // 2, + 2).transpose(2, 3).reshape(-1, dim) + else: # scales, zeros + dim = x.shape[0] + n_heads = dim // size_per_head + return x.view(n_heads, dim // n_heads // 2, 2, + 1).transpose(1, 2).reshape(dim, 1) + + +class MetaLlamaReader(BaseReader): + """MetaLlamaReader.""" + + def __init__(self, model_path: str, start_layer_id: int, + end_layer_id: int): + super().__init__() + self._start_layer_id = start_layer_id + self._end_layer_id = end_layer_id + self.params = self.load_model(model_path) + + def init_layer_id(self): + """Empty.""" + pass + + def load_model(self, model_path): + """Load all parameters.""" + checkpoints = [] + for pattern in ['*.pth', '*.pt']: + checkpoints += sorted(Path(model_path).glob(pattern)) + n_ckpt = len(checkpoints) + model_params = {} + + def get_param(_name, _size): + if _name not in model_params: + model_params[_name] = torch.zeros(_size, + dtype=torch.float16, + device='cpu') + return model_params[_name] + + from tqdm import tqdm + pbar = tqdm(total=n_ckpt, desc='load meta ckpt', leave=False) + for i, ckpt_path in enumerate(checkpoints): + ckpt = torch.load(ckpt_path, map_location='cpu') + for param_name, param_data in ckpt.items(): + key, ext = param_name.split('.')[-2:] + # column-parallel + if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']: + size = param_data.size(0) + if ext == 'weight': + param = get_param( + param_name, + [size * n_ckpt, param_data.size(1)]) + param.data[size * i:size * (i + 1), :] = param_data + else: # bias + param = get_param(param_name, [size * n_ckpt]) + param.data[size * i:size * (i + 1)] = param_data + # row-parallel + elif key in ['w2', 'wo', 'tok_embeddings']: + size = param_data.size(-1) + if ext == 'weight': + param = get_param(param_name, + [param_data.size(0), size * n_ckpt]) + param.data[:, size * i:size * (i + 1)] = param_data + else: # bias + param = get_param(param_name, [size]) + param.data = param_data + elif i == 0: + param = get_param(param_name, param_data.size()) + param.data = param_data + del ckpt + pbar.update(1) + pbar.close() + + for name, param in model_params.items(): + # transpose all weights as TurboMind is expecting column-major + # (output_dims, input_dims) -> (input_dims, output_dims) + key = name.split('.')[-2] + if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']: + param.data = param.data.t() + if key in ['wq', 'wk']: + param.data = reverse_permute(param.data) + return model_params + + def clean_up(self, last: bool) -> None: + """Clean up unused params.""" + self.params.clear() + + @property + def start_layer_id(self): + """Get start transformer layer id.""" + return self._start_layer_id + + @property + def end_layer_id(self): + """Get end transformer layer id.""" + return self._end_layer_id + + def tok_embeddings(self): + """Get embeddings.""" + return self.params.get('tok_embeddings.weight') + + def norm_weight(self): + """Get norm.""" + return self.params.get('norm.weight') + + def output_weight(self): + """Get output.""" + return self.params.get('output.weight') + + def attn(self, i: int): + """Get q, k, v, o weight for layer i.""" + result = [] + for key in ['wq', 'wk', 'wv', 'wo']: + 
tensor = self.params[f'layers.{i}.attention.{key}.weight'] + tensor = tensor.t() if tensor is not None else None + result.append(tensor) + return (*result, ) + + def attn_bias(self, i: int): + """Get q, k, v, o bias for layer i.""" + result = [] + for key in ['wq', 'wk', 'wv', 'wo']: + tensor = self.params.get(f'layers.{i}.attention.{key}.bias') + tensor = tensor.t() if tensor is not None else None + result.append(tensor) + return (*result, ) + + def attn_zero(self, i: int): + """Get q, k, v, o zero point for layer i.""" + return (None, ) * 4 + + def attn_scale(self, i: int): + """Get q, k, v, o scale for layer i.""" + return (None, ) * 4 + + def attn_norm(self, i: int): + """Get attn norm for layer i.""" + return self.params[f'layers.{i}.attention_norm.weight'] + + def ffn(self, i: int): + """Get ffn weight for layer i.""" + result = [] + for key in ['w1', 'w2', 'w3']: + tensor = self.params[f'layers.{i}.feed_forward.{key}.weight'] + result.append(tensor.t()) + return (*result, ) + + def ffn_zero(self, i: int): + """Get ffn zero point for layer i.""" + return (None, ) * 3 + + def ffn_scale(self, i: int): + """Get ffn scale for layer i.""" + return (None, ) * 3 + + def ffn_norm(self, i: int): + """Get ffn norm for layer i.""" + return self.params[f'layers.{i}.ffn_norm.weight'] + + +@INPUT_MODELS.register_module(name='llama') +class MetaLlamaModel(BaseInputModel): + """Llama model in fb format.""" + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + + @property + def nmgrs(self): + """Get number of checkpoint.""" + return 1 + + def get_mgrs(self): + """Conctruct all BaseReader.""" + end_layer_id = self.model_info()['num_layer'] + try: + if hasattr(self, 'meta_reader'): + yield self.meta_reader + else: + self.meta_reader = MetaLlamaReader(self.model_path, 0, + end_layer_id) + yield self.meta_reader + except GeneratorExit: + pass + + def tokenizer_info(self): + """Read tokenizer info.""" + assert osp.isfile(self.tokenizer_path), self.tokenizer_path + sp_model = SentencePieceProcessor(model_file=self.tokenizer_path) + # BOS / EOS token IDs + n_words = sp_model.vocab_size() + bos_id = sp_model.bos_id() + eos_id = sp_model.eos_id() + return n_words, bos_id, eos_id + + def model_info(self): + """Read model info.""" + params_path = osp.join(self.model_path, 'params.json') + with open(params_path) as f: + model_arg = json.load(f) + num_layer = model_arg['n_layers'] + norm_eps = model_arg['norm_eps'] + head_num = model_arg.get('n_heads', 32) + kv_head_num = model_arg.get('n_kv_heads', head_num) + + return dict(num_layer=num_layer, + norm_eps=norm_eps, + head_num=head_num, + kv_head_num=kv_head_num) diff --git a/lmdeploy/turbomind/deploy/source_model/qwen.py b/lmdeploy/turbomind/deploy/source_model/qwen.py new file mode 100644 index 0000000000..09ff93afc5 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/qwen.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp + +import torch + +from .base import INPUT_MODELS +from .llama import LlamaModel, LlamaReader + + +class QwenReader(LlamaReader): + """QwenReader.""" + + attn_layer_patten = r'transformer.h.([0-9]+).' 
+ tok_embeddings_key = 'transformer.wte.weight' + norm_weight_key = 'transformer.ln_f.weight' + output_weight_key = 'lm_head.weight' + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0): + """Get q, k, v, o kind for layer i.""" + qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}'] + q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim) + o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None) + if o is None: + o = torch.zeros_like(q) + return q, k, v, o + + def attn(self, i: int): + """Get q, k, v, o weight for layer i.""" + return self._attn(i, 'weight', 0, 0) + + def attn_bias(self, i: int): + """Get q, k, v, o bias for layer i.""" + return self._attn(i, 'bias', -1, 0) + + def attn_zero(self, i: int): + """Get q, k, v, o zero point for layer i.""" + return (None, ) * 4 + + def attn_scale(self, i: int): + """Get q, k, v, o scale for layer i.""" + return (None, ) * 4 + + def attn_norm(self, i: int): + """Get attn norm for layer i.""" + return self.params[f'transformer.h.{i}.ln_1.weight'] + + def _ffn(self, i: int, kind: str): + """Get ffn kind for layer i.""" + result = [] + for key in ['w2', 'c_proj', 'w1']: + tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}'] + result.append(tensor) + return (*result, ) + + def ffn(self, i: int): + """Get ffn weight for layer i.""" + return self._ffn(i, 'weight') + + def ffn_zero(self, i: int): + """Get ffn zero point for layer i.""" + return (None, ) * 3 + + def ffn_scale(self, i: int): + """Get ffn scale for layer i.""" + return (None, ) * 3 + + def ffn_norm(self, i: int): + """Get ffn norm for layer i.""" + return self.params[f'transformer.h.{i}.ln_2.weight'] + + +@INPUT_MODELS.register_module(name='qwen') +class QwenModel(LlamaModel): + """Qwen model in hf format.""" + + Reader = QwenReader + + def __init__(self, model_path: str, tokenizer_path: str, **kwargs): + super().__init__(model_path, tokenizer_path, **kwargs) + + def tokenizer_info(self): + """Read tokenizer info.""" + n_words = 151851 + bos_id = 0 + eos_id = 151643 + return n_words, bos_id, eos_id + + def model_info(self): + """Read model info.""" + params_path = osp.join(self.model_path, 'config.json') + with open(params_path) as f: + config = json.load(f) + num_layer = config['num_hidden_layers'] + norm_eps = config['layer_norm_epsilon'] + rope_theta = float(config.get('rotary_emb_base', 10000.0)) + if 'num_key_value_heads' in config: + kv_head_num = config['num_key_value_heads'] + else: + kv_head_num = config['num_attention_heads'] + seq_length = config['seq_length'] + use_dynamic_ntk = int(config['use_dynamic_ntk']) + use_logn_attn = int(config['use_logn_attn']) + return dict(num_layer=num_layer, + norm_eps=norm_eps, + kv_head_num=kv_head_num, + rope_theta=rope_theta, + max_position_embeddings=seq_length, + use_dynamic_ntk=int(use_dynamic_ntk), + use_logn_attn=use_logn_attn) diff --git a/lmdeploy/turbomind/deploy/source_model/qwen_awq.py b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py new file mode 100644 index 0000000000..04df2ac729 --- /dev/null +++ b/lmdeploy/turbomind/deploy/source_model/qwen_awq.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
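Unlike the Llama checkpoints, Qwen stores the attention projection as a single fused `c_attn` tensor, so `QwenReader._attn` above recovers q/k/v with a three-way `torch.split` along the output dimension. The slicing logic in isolation, with toy dimensions rather than real Qwen shapes:

```python
import torch

hidden, heads, head_dim = 8, 2, 4            # toy dimensions for illustration only
c_attn_weight = torch.arange(3 * heads * head_dim * hidden,
                             dtype=torch.float32).view(3 * heads * head_dim, hidden)

# mirrors QwenReader._attn(i, 'weight', size_dim=0, dim=0)
q, k, v = torch.split(c_attn_weight, c_attn_weight.size(0) // 3, dim=0)
assert q.shape == k.shape == v.shape == (heads * head_dim, hidden)

# the bias is 1-D, so attn_bias() measures the split size along the last dim instead
c_attn_bias = torch.zeros(3 * heads * head_dim)
qb, kb, vb = torch.split(c_attn_bias, c_attn_bias.size(-1) // 3, dim=0)
assert qb.numel() == heads * head_dim
```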
+from .base import INPUT_MODELS +from .llama_awq import ensure_fp16orint32 +from .qwen import QwenModel, QwenReader + + +class QwenAwqReader(QwenReader): + """QwenAwqReader.""" + + def __init__(self, new_params: dict, unused_params: dict, last_bin: bool): + super().__init__(new_params, unused_params, last_bin) + + def attn(self, i: int): + """Get q, k, v, o qweight for layer i.""" + return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1)) + + def attn_bias(self, i: int): + """Get q, k, v, o bias for layer i.""" + return ensure_fp16orint32(self._attn(i, 'bias', -1, 0)) + + def attn_zero(self, i: int): + """Get q, k, v, o qzeros for layer i.""" + return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1)) + + def attn_scale(self, i: int): + """Get q, k, v, o scales for layer i.""" + return ensure_fp16orint32(self._attn(i, 'scales', -1, -1)) + + def ffn(self, i: int): + """Get ffn qweight for layer i.""" + # ours: w2(silu(w1(x)) * w3(x)) + # qwen: c_proj(w1(x) * silu(w2(x))) + return ensure_fp16orint32(self._ffn(i, 'qweight')) + + def ffn_zero(self, i: int): + """Get ffn qzeros for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'qzeros')) + + def ffn_scale(self, i: int): + """Get ffn scales for layer i.""" + return ensure_fp16orint32(self._ffn(i, 'scales')) + + +@INPUT_MODELS.register_module(name='qwen-awq') +class QwenAwqModel(QwenModel): + """Qwen awq model in hf format.""" + + Reader = QwenAwqReader + + def __init__(self, + model_path: str, + tokenizer_path: str, + ckpt_path: str = None, + **kwargs): + super().__init__(model_path, + tokenizer_path, + ckpt_path=ckpt_path, + **kwargs) diff --git a/lmdeploy/turbomind/deploy/target_model/__init__.py b/lmdeploy/turbomind/deploy/target_model/__init__.py new file mode 100644 index 0000000000..fe03500e45 --- /dev/null +++ b/lmdeploy/turbomind/deploy/target_model/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .fp import TurbomindModel # noqa: F401 +from .w4 import TurbomindW4Model # noqa: F401 diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py new file mode 100644 index 0000000000..5e9b5341f7 --- /dev/null +++ b/lmdeploy/turbomind/deploy/target_model/base.py @@ -0,0 +1,236 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
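Both halves of the converter are organised as registries: `INPUT_MODELS` collects the source formats registered above (`hf`, `hf-awq`, `llama`, `qwen`, `qwen-awq`) and `OUTPUT_MODELS`, declared in `target_model/base.py` below, collects the target formats (`fp16`, `w4`). Both rely on the stock `mmengine.Registry` lookup-then-instantiate pattern; a toy example with made-up names, independent of lmdeploy:

```python
from mmengine import Registry

# Toy registry mirroring how INPUT_MODELS / OUTPUT_MODELS are used in this patch.
FORMATS = Registry('toy formats')


@FORMATS.register_module(name='hf')
class HfFormat:

    def __init__(self, model_path: str):
        self.model_path = model_path


# Look the class up by name, then instantiate it with format-specific kwargs --
# exactly how the converter pairs a source format with a target format.
fmt_cls = FORMATS.get('hf')
fmt = fmt_cls(model_path='/path/to/model')   # placeholder path
print(type(fmt).__name__, fmt.model_path)
```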
+import configparser +import inspect +import os.path as osp +from abc import ABC, abstractmethod +from dataclasses import dataclass + +import torch +import tqdm +from mmengine import Registry + +from lmdeploy.model import MODELS + +from ..source_model.base import BaseInputModel, BaseReader + +OUTPUT_MODELS = Registry( + 'target model', locations=['lmdeploy.turbomind.deploy.target_model.base']) + + +def tprint(*args, **kwargs): + from io import StringIO + s = StringIO() + print(*args, **kwargs, file=s, end='') + tqdm.tqdm.write(s.getvalue()) + + +@dataclass +class TurbomindModelConfig: + """Config for turbomind model.""" + model_name: str + tensor_para_size: int + head_num: int + kv_head_num: int + vocab_size: int + num_layer: int + inter_size: int + norm_eps: float + attn_bias: int + start_id: int + end_id: int + session_len: int + weight_type: str = 'fp16' + rotary_embedding: int = 128 + rope_theta: float = 10000.0 + size_per_head: int = 128 + group_size: int = 0 + max_batch_size: int = 32 + max_context_token_num: int = 4 + step_length: int = 1 + cache_max_entry_count: int = 48 + cache_chunk_size: int = 1 + use_context_fmha: int = 1 + quant_policy: int = 0 + max_position_embeddings: int = 0 + use_dynamic_ntk: int = 0 + use_logn_attn: int = 0 + + @classmethod + def from_dict(cls, env, allow_none=False): + """Construct from dict.""" + params = inspect.signature(cls).parameters + used = {k: v for k, v in env.items() if k in params and v is not None} + if not allow_none: + return cls(**used) + else: + default = { + k: None + for k in params.keys() if params[k].default is inspect._empty + } + default.update(used) + return cls(**default) + + @property + def valid(self): + """Check if cfg is valid.""" + for _, v in self.__dict__.items(): + if v is None: + return False + return True + + +class BaseOutputModel(ABC): + """Base output model.""" + + def __init__(self, + input_model: BaseInputModel, + cfg: TurbomindModelConfig, + to_file: bool = True, + out_dir: str = ''): + super().__init__() + self.input_model = input_model + self.cfg = self.get_config(cfg) + assert self.cfg.valid + self.to_file = to_file + self.out_dir = out_dir + + @abstractmethod + def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig: + """Generate turbomind model config (config.ini).""" + _, bos_id, eos_id = self.input_model.tokenizer_info() + model = MODELS.get(cfg.model_name)() + final_cfg = cfg.__dict__ + final_cfg.update( + dict(start_id=bos_id, + end_id=eos_id, + session_len=model.session_len + 8)) + final_cfg.update(self.input_model.model_info()) + + # head_num, vocab_size + for bin in self.input_model.bins(): + emb = bin.tok_embeddings() + if emb is not None: + _vocab_size, dim = emb.shape + head_num = dim // cfg.size_per_head + break + final_cfg.update(dict(head_num=head_num, vocab_size=_vocab_size)) + return TurbomindModelConfig.from_dict(final_cfg, allow_none=True) + + def export_config(self) -> None: + """export turbomind config.""" + if self.to_file: + config = configparser.ConfigParser() + cfg = dict(llama=self.cfg.__dict__) + for section, key_values in cfg.items(): + config[section] = key_values + config_path = osp.join(self.out_dir, 'config.ini') + with open(config_path, 'w') as f: + config.write(f) + + def export_weight(self, param: torch.Tensor, name: str) -> None: + """export turbomind weight.""" + if self.to_file: + if param.dtype in [torch.float, torch.bfloat16]: + param = param.half() + tprint(name, param.shape) + param.contiguous().cpu().numpy().tofile( + osp.join(self.out_dir, name)) + + 
def save_split(self, + tensor: torch.Tensor, + name: str, + split_dim=None, + copy=False) -> None: + """save split.""" + tp = self.cfg.tensor_para_size + if split_dim is not None: + tprint(f'*** splitting {name}, shape={tensor.shape}, ' + f'split_dim={split_dim}, tp={tp}') + assert tensor.shape[split_dim] % tp == 0 + split_size = tensor.shape[split_dim] // tp + splits = torch.split(tensor, split_size, dim=split_dim) + for i, split in enumerate(splits): + prefix, ext = osp.splitext(name) + self.export_weight(split, f'{prefix}.{i}{ext}') + elif copy: + tprint(f'### copying {name}, shape={tensor.shape}') + copies = [tensor] * tp + for i, copy in enumerate(copies): + prefix, ext = osp.splitext(name) + self.export_weight(copy, f'{prefix}.{i}{ext}') + else: + self.export_weight(tensor, name) + + def export(self) -> None: + """Export to turbomind model format.""" + num_layer = self.cfg.num_layer + from tqdm import tqdm + pbar = tqdm(total=num_layer, desc='Convert to turbomind format') + self.export_config() + for bin in self.input_model.bins(): + self.export_misc(bin) + for i in range(bin.start_layer_id, bin.end_layer_id): + self.export_transformer_block(bin, i) + pbar.update(1) + pbar.close() + # manually clean up meta reader + if hasattr(self.input_model, 'meta_reader'): + self.input_model.meta_reader.clean_up(True) + del self.input_model.meta_reader + torch.cuda.empty_cache() + + def export_misc(self, bin: BaseReader) -> None: + """Export embedding, norm, output weight.""" + emb = bin.tok_embeddings() + norm_weight = bin.norm_weight() + output_weight = bin.output_weight() + + def pad_weight(tensor): + pad_size = None + vocab_size = self.cfg.vocab_size + tp = self.cfg.tensor_para_size + if vocab_size % tp != 0: + pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size + + if pad_size is None: + return tensor + return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size), + 'constant', 0) + + if emb is not None: + emb = pad_weight(emb) + self.export_weight(emb, 'tok_embeddings.weight') + if norm_weight is not None: + self.export_weight(norm_weight, 'norm.weight') + if output_weight is not None: + output_weight = pad_weight(output_weight) + self.export_weight(output_weight, 'output.weight') + + @abstractmethod + def export_transformer_block(self, bin: BaseReader, i: int) -> None: + """Export transformer block.""" + pass + + +def permute(x: torch.Tensor, size_per_head: int = 128): + if x.shape[-1] > 1: + dim = x.shape[-1] + n_heads = dim // size_per_head + return x.view(-1, n_heads, 2, + dim // n_heads // 2).transpose(2, 3).reshape(-1, dim) + else: # scales, zeros + dim = x.shape[0] + n_heads = dim // size_per_head + return x.view(n_heads, 2, dim // n_heads // 2, + 1).transpose(1, 2).reshape(dim, 1) + + +def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int, + dim: int): + + def reshape(x): + return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1) + + qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1) + # (input_dim, head_num + 2 * kv_head_num) + return qkv.view(q.size(0), -1) diff --git a/lmdeploy/turbomind/deploy/target_model/fp.py b/lmdeploy/turbomind/deploy/target_model/fp.py new file mode 100644 index 0000000000..d9a7783436 --- /dev/null +++ b/lmdeploy/turbomind/deploy/target_model/fp.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
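The `permute` helper at the end of `target_model/base.py` above re-applies the rotary-embedding head reordering that `reverse_permute` in `source_model/meta_llama.py` undoes, so composing the two is the identity; `merge_qkv` then groups q/k/v columns per tensor-parallel rank so that `save_split(..., -1)` can slice one shard per rank. A small sanity check, assuming this patch and its dependencies are importable:

```python
import torch

from lmdeploy.turbomind.deploy.source_model.meta_llama import reverse_permute
from lmdeploy.turbomind.deploy.target_model.base import merge_qkv, permute

x = torch.randn(256, 256)                      # (input_dim, output_dim), 2 heads of 128
assert torch.equal(permute(reverse_permute(x)), x)   # the two layouts round-trip exactly

# merge_qkv with tp=2: q/k/v columns are regrouped per tensor-parallel rank
q = torch.randn(256, 256)
k = torch.randn(256, 64)                        # narrower k/v also reshapes cleanly
v = torch.randn(256, 64)
qkv = merge_qkv(q, k, v, tp=2, dim=2)
assert qkv.shape == (256, 256 + 64 + 64)
```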
+from typing import List + +import torch + +from ..source_model.base import BaseInputModel, BaseReader +from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig, + merge_qkv, permute) + + +def transpose_tensor(input: List[torch.Tensor]): + """Transpose tensor.""" + output = [x.cuda().t() for x in input] + return output + + +@OUTPUT_MODELS.register_module(name='fp16') +class TurbomindModel(BaseOutputModel): + """Export to turbomind fp16 format.""" + + def __init__(self, + input_model: BaseInputModel, + cfg: TurbomindModelConfig, + to_file: bool = True, + out_dir: str = ''): + super().__init__(input_model, cfg, to_file, out_dir) + + def get_config(self, cfg: TurbomindModelConfig): + """Get turbomind config.""" + final_cfg = super().get_config(cfg).__dict__ + + # attn_bias, inter_size + visit = False + attn_bias = 0 + for bin in self.input_model.bins(): + for i in range(bin.start_layer_id, bin.end_layer_id): + visit = True + w1, _, _ = bin.ffn(i) + inter_size = w1.t().shape[-1] + qb, _, _, _ = bin.attn_bias(i) + if qb is not None: + attn_bias = 1 + break + if visit: + break + final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) + return TurbomindModelConfig.from_dict(final_cfg) + + def export_transformer_block(self, bin: BaseReader, i: int): + """Export transformer layer i.""" + assert bin.start_layer_id <= i < bin.end_layer_id + tp = self.cfg.tensor_para_size + size_per_head = self.cfg.size_per_head + # attn + qw, kw, vw, ow = bin.attn(i) + qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow]) + qw = permute(qw, size_per_head) + kw = permute(kw, size_per_head) + qkv_w = merge_qkv(qw, kw, vw, tp, dim=2) + self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1) + self.save_split(ow, f'layers.{i}.attention.wo.weight', 0) + qb, kb, vb, ob = bin.attn_bias(i) + if qb is not None: + qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob]) + qb = permute(qb, size_per_head) + kb = permute(kb, size_per_head) + qkv_b = merge_qkv(qb, kb, vb, tp, dim=1) + self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1) + self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True) + # ffn + w1, w2, w3 = bin.ffn(i) + w1, w2, w3 = transpose_tensor([w1, w2, w3]) + self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1) + self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1) + self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0) + # norm + attn_norm = bin.attn_norm(i) + ffn_norm = bin.ffn_norm(i) + self.save_split(attn_norm, f'layers.{i}.attention_norm.weight') + self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') diff --git a/lmdeploy/turbomind/deploy/target_model/w4.py b/lmdeploy/turbomind/deploy/target_model/w4.py new file mode 100644 index 0000000000..282c7df607 --- /dev/null +++ b/lmdeploy/turbomind/deploy/target_model/w4.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
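Taken together, a conversion run pairs one `INPUT_MODELS` entry with one `OUTPUT_MODELS` entry. The official entry point is `lmdeploy convert`, whose wiring is not part of this excerpt; the sketch below is a rough reconstruction of how the pieces shown here compose, with placeholder paths, an assumed chat-template name, and a CUDA device required because the fp16 exporter moves tensors to GPU:

```python
import os

from lmdeploy.turbomind.deploy.source_model import llama  # noqa: F401, registers 'hf'
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
from lmdeploy.turbomind.deploy.target_model import fp  # noqa: F401, registers 'fp16'
from lmdeploy.turbomind.deploy.target_model.base import (OUTPUT_MODELS,
                                                         TurbomindModelConfig)

model_path = '/path/to/internlm-chat-7b'        # placeholder HF checkpoint
out_dir = './workspace/triton_models/weights'   # placeholder output directory
os.makedirs(out_dir, exist_ok=True)

# Only the fields the readers cannot infer need to be supplied up front;
# get_config() fills the rest from config.json, the tokenizer and the embeddings.
cfg = TurbomindModelConfig.from_dict(
    dict(model_name='internlm-chat-7b', tensor_para_size=1), allow_none=True)

input_model = INPUT_MODELS.get('hf')(model_path=model_path,
                                     tokenizer_path=model_path)
output_model = OUTPUT_MODELS.get('fp16')(input_model=input_model,
                                         cfg=cfg,
                                         out_dir=out_dir)
output_model.export()   # writes config.ini plus the split weight files
```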
+import os.path as osp +import sys + +import torch + +import lmdeploy + +from ..source_model.base import BaseInputModel, BaseReader +from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig, + merge_qkv, permute) + +# import _turbomind as _tm +# TODO: find another way import _turbomind +lmdeploy_dir = osp.split(lmdeploy.__file__)[0] +sys.path.append(osp.join(lmdeploy_dir, 'lib')) +import _turbomind as _tm # noqa: E402 + + +def transpose_qk_s4(src: torch.Tensor, group_size): + assert src.is_contiguous() + dst = torch.zeros_like(src) + _tm.transpose_qk_s4_k_m8(src, dst, + src.size(-1) * 8, src.size(0), group_size) + return dst + + +def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, w1_s: torch.Tensor, + w3_qw: torch.Tensor, w3_qz: torch.Tensor, + w3_s: torch.Tensor): + + def fuse(a: torch.Tensor, b: torch.Tensor): + ab = torch.cat((a, b)).contiguous() + _ab = torch.zeros_like(ab) + _tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0)) + return _ab.view(a.size(0), -1) + + w13_qw = fuse(w1_qw, w3_qw) + w13_qz = fuse(w1_qz, w3_qz) + + w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1) + w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1) + + return w13_qw, w13_qz, w13_s + + +def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor, + group_size: int): + assert qw.is_contiguous() + assert qz.is_contiguous() + assert s.is_contiguous() + _qw = torch.zeros_like(qw) + _sz = torch.zeros_like(s, dtype=torch.int32) # half2 + _ws = torch.zeros_like(s) + _tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz, + qw.size(-1) * 8, qw.size(0), group_size) + return _qw, _sz + + +def tp_m_s4(x: torch.Tensor, tp: int): + return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3, + 1).contiguous() + + +def get_cuda_tensor(tensors): + """Get cuda tensor.""" + result = map(lambda x: x.cuda() if x is not None else x, tensors) + return (*result, ) + + +@OUTPUT_MODELS.register_module(name='w4') +class TurbomindW4Model(BaseOutputModel): + """Export to turbomind w4a16 format.""" + + def __init__(self, + input_model: BaseInputModel, + cfg: TurbomindModelConfig, + to_file: bool = True, + out_dir: str = ''): + super().__init__(input_model, cfg, to_file, out_dir) + + def get_config(self, cfg: TurbomindModelConfig): + """Get turbomind config.""" + final_cfg = super().get_config(cfg).__dict__ + + # attn_bias, inter_size + visit = False + attn_bias = 0 + for bin in self.input_model.bins(): + for i in range(bin.start_layer_id, bin.end_layer_id): + visit = True + w1s, _, _ = bin.ffn_scale(i) + inter_size = w1s.shape[-1] + qb, _, _, _ = bin.attn_bias(i) + if qb is not None: + attn_bias = 1 + break + if visit: + break + final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size)) + return TurbomindModelConfig.from_dict(final_cfg) + + def export_transformer_block(self, bin: BaseReader, i: int): + """Export transformer layer i.""" + group_size = self.cfg.group_size + tp = self.cfg.tensor_para_size + size_per_head = self.cfg.size_per_head + # attn + q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i)) + q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i)) + q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i)) + + q_qw = transpose_qk_s4(q_qw, group_size) + k_qw = transpose_qk_s4(k_qw, group_size) + q_qz = transpose_qk_s4(q_qz, group_size) + k_qz = transpose_qk_s4(k_qz, group_size) + q_s = permute(q_s, size_per_head) + k_s = permute(k_s, size_per_head) + + qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2) + qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2) + qkv_s = 
merge_qkv(q_s, k_s, v_s, tp, dim=2) + + qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size) + qkv_qw = tp_m_s4(qkv_qw, tp) + self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1) + self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1) + + o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size) + self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0) + self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0) + + q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i)) + if q_b is not None: + q_b = permute(q_b, size_per_head) + k_b = permute(k_b, size_per_head) + qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1) + self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1) + self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True) + + # ffn weights + w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i)) + w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i)) + w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i)) + + w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz, + w3_s) + w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size) + w13_qw = tp_m_s4(w13_qw, tp) + self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1) + self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros', + -1) + + w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size) + self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0) + self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0) + + # norm + attn_norm = bin.attn_norm(i) + ffn_norm = bin.ffn_norm(i) + self.save_split(attn_norm, f'layers.{i}.attention_norm.weight') + self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight') diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py index 328f182158..9a4f0e8c4d 100644 --- a/lmdeploy/turbomind/generate_gemm_config.py +++ b/lmdeploy/turbomind/generate_gemm_config.py @@ -2,8 +2,6 @@ import subprocess -import fire - def get_llama_gemm(): import os.path as osp @@ -30,4 +28,6 @@ def main(head_num: int = 32, if __name__ == '__main__': + import fire + fire.Fire(main) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index dcfc499e89..9d2186fea9 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -13,7 +13,7 @@ from torch.nn.utils.rnn import pad_sequence import lmdeploy -from lmdeploy.model import MODELS +from lmdeploy.model import MODELS, BaseModel from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger @@ -78,7 +78,11 @@ class TurboMind: tp (int): tensor parallel """ - def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1): + def __init__(self, + model_path: str, + eos_id: int = 2, + tp: int = 1, + **kwargs): self.eos_id = eos_id # TODO: support mpi @@ -88,7 +92,6 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1): # read meta from model path assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n' self.gpu_count = tp - self.session_len = 2048 data_type = 'fp16' ini_path = osp.join(model_path, 'triton_models/weights/config.ini') with open(ini_path, 'r') as f: @@ -102,18 +105,18 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1): if len(section_name) > 0: tp_cfg = parser.getint(section_name, 'tensor_para_size') - self.session_len = parser.getint(section_name, 'session_len') if tp_cfg != 1 and tp_cfg != tp: get_logger('turbomind').info( f'found tp={tp_cfg} in config.ini.') self.gpu_count = tp_cfg self.model_name = 
parser.get(section_name, 'model_name') data_type = parser.get(section_name, 'weight_type') - model = MODELS.get(self.model_name)() + self.model: BaseModel = MODELS.get(self.model_name)(**kwargs) + self.session_len = self.model.session_len tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer') tokenizer = Tokenizer(tokenizer_model_path) - self.stop_words = _stop_words(model.stop_words, tokenizer) + self.stop_words = _stop_words(self.model.stop_words, tokenizer) # params self.node_id = node_id @@ -122,17 +125,17 @@ def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1): # create model weight_dir = osp.join(model_path, 'triton_models', 'weights') - model = _tm.AbstractTransformerModel.create_llama_model( + model_comm = _tm.AbstractTransformerModel.create_llama_model( weight_dir, tensor_para_size=self.gpu_count, data_type=data_type) - self.model = model - self.nccl_params = model.create_nccl_params(self.node_id) + self.model_comm = model_comm + self.nccl_params = model_comm.create_nccl_params(self.node_id) torch.cuda.synchronize() # create weight def _create_weight(device_id): with cuda_ctx(device_id): rank = self.node_id * self.gpu_count + device_id - model.create_shared_weights(device_id, rank) + model_comm.create_shared_weights(device_id, rank) threads = [] for device_id in range(self.gpu_count): @@ -161,7 +164,7 @@ class TurboMindInstance: cuda_stream_id(int): identity of a cuda stream """ - def __init__(self, tm_model, cuda_stream_id=0): + def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0): self.tm_model = tm_model self.cuda_stream_id = cuda_stream_id @@ -175,7 +178,7 @@ def __init__(self, tm_model, cuda_stream_id=0): self.session_len = tm_model.session_len self.nccl_params = tm_model.nccl_params - self.instance_comm = tm_model.model.create_instance_comm( + self.instance_comm = tm_model.model_comm.create_instance_comm( self.gpu_count) # create model instances @@ -196,7 +199,7 @@ def __init__(self, tm_model, cuda_stream_id=0): def _create_model_instance(self, device_id, model_insts): with cuda_ctx(device_id): rank = self.node_id * self.gpu_count + device_id - model_inst = self.tm_model.model.create_model_instance( + model_inst = self.tm_model.model_comm.create_model_instance( device_id, rank, self.cuda_stream_id, self.nccl_params) model_insts[device_id] = model_inst @@ -266,7 +269,7 @@ def stream_infer(self, self.model_insts[0].register_callback(self._forward_callback) if len(input_ids) == 0: - input_ids = [] + input_ids = [[]] if isinstance(input_ids[0], int): input_ids = [input_ids] @@ -381,7 +384,7 @@ def decode(self, input_ids): """ if len(input_ids) == 0: - input_ids = [] + input_ids = [[]] if isinstance(input_ids[0], int): input_ids = [input_ids] diff --git a/lmdeploy/version.py b/lmdeploy/version.py index 417dc76768..0bd4914cc4 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
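On the runtime side, `TurboMind` above now takes `session_len` and stop words from the chat template selected by the `model_name` stored in `config.ini`, forwards extra keyword arguments to that template, and normalizes empty prompts to `[[]]` before indexing `input_ids[0]`. A rough usage sketch; the workspace path is a placeholder, a converted model and a CUDA device are assumed, and the token ids are arbitrary examples:

```python
from lmdeploy.turbomind.turbomind import TurboMind, TurboMindInstance

# model_name, weight_type and tensor_para_size are read from
# ./workspace/triton_models/weights/config.ini written by the converter.
tm = TurboMind('./workspace', tp=1)
print(tm.model_name, tm.session_len)

inst = TurboMindInstance(tm)       # one instance per concurrent stream
logits = inst.decode([1, 2, 3])    # placeholder token ids; [] is also accepted now
```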
from typing import Tuple -__version__ = '0.0.11' +__version__ = '0.0.14' short_version = __version__ diff --git a/requirements.txt b/requirements.txt index 9eacb498fb..27049672c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ accelerate datasets fastapi fire -gradio +gradio<4.0.0 mmengine numpy pybind11 diff --git a/setup.py b/setup.py index 09ae1e31c2..df36118c23 100644 --- a/setup.py +++ b/setup.py @@ -121,26 +121,29 @@ def gen_packages_items(): if __name__ == '__main__': lmdeploy_package_data = ['lmdeploy/bin/llama_gemm'] - setup(name='lmdeploy', - version=get_version(), - description='A toolset for compressing, deploying and serving LLM', - long_description=readme(), - long_description_content_type='text/markdown', - author='OpenMMLab', - author_email='openmmlab@gmail.com', - packages=find_packages(exclude=()), - package_data={ - 'lmdeploy': lmdeploy_package_data, - }, - include_package_data=True, - install_requires=parse_requirements('requirements.txt'), - has_ext_modules=check_ext_modules, - classifiers=[ - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Intended Audience :: Developers', - 'Intended Audience :: Education', - 'Intended Audience :: Science/Research', - ]) + setup( + name='lmdeploy', + version=get_version(), + description='A toolset for compressing, deploying and serving LLM', + long_description=readme(), + long_description_content_type='text/markdown', + author='OpenMMLab', + author_email='openmmlab@gmail.com', + packages=find_packages(exclude=()), + package_data={ + 'lmdeploy': lmdeploy_package_data, + }, + include_package_data=True, + install_requires=parse_requirements('requirements.txt'), + has_ext_modules=check_ext_modules, + classifiers=[ + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Intended Audience :: Developers', + 'Intended Audience :: Education', + 'Intended Audience :: Science/Research', + ], + entry_points={'console_scripts': ['lmdeploy = lmdeploy.cli:run']}, + ) diff --git a/tests/test_lmdeploy/test_cli.py b/tests/test_lmdeploy/test_cli.py new file mode 100644 index 0000000000..a41eab442e --- /dev/null +++ b/tests/test_lmdeploy/test_cli.py @@ -0,0 +1,51 @@ +import inspect + + +def compare_func(class_method, function): + """Compare if a class method has same arguments as a function.""" + + argspec_cls = inspect.getfullargspec(class_method) + argspec_func = inspect.getfullargspec(function) + assert argspec_cls.args[1:] == argspec_func.args + assert argspec_cls.defaults == argspec_func.defaults + assert argspec_cls.annotations == argspec_func.annotations + + +def test_cli(): + + from lmdeploy.cli.cli import CLI + from lmdeploy.serve.turbomind.deploy import main as convert + compare_func(CLI.convert, convert) + + +def test_subcli_chat(): + from lmdeploy.cli.chat import SubCliChat + from lmdeploy.pytorch.chat import main as run_torch_model + from lmdeploy.turbomind.chat import main as run_turbomind_model + + compare_func(SubCliChat.torch, run_torch_model) + compare_func(SubCliChat.turbomind, run_turbomind_model) + + +def test_subcli_lite(): + from lmdeploy.cli.lite import SubCliLite + from lmdeploy.lite.apis.auto_awq import auto_awq + from lmdeploy.lite.apis.calibrate import calibrate + from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams + + 
compare_func(SubCliLite.auto_awq, auto_awq) + compare_func(SubCliLite.calibrate, calibrate) + compare_func(SubCliLite.kv_qparams, run_kv_qparams) + + +def test_subcli_serve(): + from lmdeploy.cli.serve import SubCliServe + from lmdeploy.serve.client import main as run_triton_client + from lmdeploy.serve.gradio.app import run as run_gradio + from lmdeploy.serve.openai.api_client import main as run_api_client + from lmdeploy.serve.openai.api_server import main as run_api_server + + compare_func(SubCliServe.gradio, run_gradio) + compare_func(SubCliServe.api_server, run_api_server) + compare_func(SubCliServe.api_client, run_api_client) + compare_func(SubCliServe.triton_client, run_triton_client) diff --git a/tests/test_lmdeploy/test_tokenizer.py b/tests/test_lmdeploy/test_tokenizer.py new file mode 100644 index 0000000000..ff7d8047b2 --- /dev/null +++ b/tests/test_lmdeploy/test_tokenizer.py @@ -0,0 +1,24 @@ +import pytest + +from lmdeploy.tokenizer import HuggingFaceTokenizer + + +@pytest.mark.parametrize('model_path', [ + 'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat', + 'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf', + 'upstage/SOLAR-0-70b-16bit' +]) +@pytest.mark.parametrize( + 'input', ['hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5]) +def test_tokenizer(model_path, input): + tokenizer = HuggingFaceTokenizer(model_path) + encoded = tokenizer.encode(input) + output = '' + offset = 0 + for i in range(1, len(encoded) + 1): + decoded = tokenizer.decode(encoded[:i], offset) + if decoded.endswith('�'): + continue + output += decoded + offset = i + assert input == output, 'input string should equal to output after enc-dec'
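The `compare_func` helper is what keeps the new `lmdeploy` console entry point honest: every CLI method must expose exactly the argument names, defaults and annotations of the function it delegates to, so `fire`-generated help stays in sync with the underlying APIs. A self-contained toy version of that contract check; the `convert`/`CLI` names below are illustrative stand-ins, not the real implementations:

```python
import inspect


def compare_func(class_method, function):
    """Assert a CLI method mirrors the signature of the function it wraps."""
    argspec_cls = inspect.getfullargspec(class_method)
    argspec_func = inspect.getfullargspec(function)
    assert argspec_cls.args[1:] == argspec_func.args      # skip `self`
    assert argspec_cls.defaults == argspec_func.defaults
    assert argspec_cls.annotations == argspec_func.annotations


def convert(model_name: str, model_path: str, tp: int = 1):
    """Stand-in for the underlying API function."""


class CLI:

    def convert(self, model_name: str, model_path: str, tp: int = 1):
        """Stand-in for the fire-exposed CLI method."""


compare_func(CLI.convert, convert)   # passes: the two signatures line up
```

If a default or annotation drifts between the CLI wrapper and the wrapped function, the corresponding assertion fails, which is exactly what the tests above enforce for `convert`, `chat`, `lite`, and `serve`.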