From b470f06011a9d64bdd6039f4167cab5fa9f1b6f2 Mon Sep 17 00:00:00 2001
From: irexyc
Date: Wed, 22 Nov 2023 15:23:35 +0800
Subject: [PATCH] update

---
 docs/en/load_hf.md                           | 27 +++++++++----------
 docs/zh_cn/load_hf.md                        | 22 +++++++--------
 lmdeploy/turbomind/turbomind.py              |  2 +-
 .../triton_backend/llama/LlamaTritonModel.cc |  9 +------
 4 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/docs/en/load_hf.md b/docs/en/load_hf.md
index 9e3f1c836f..ddf6fe8bfd 100644
--- a/docs/en/load_hf.md
+++ b/docs/en/load_hf.md
@@ -1,41 +1,40 @@
 # Load huggingface model directly

-Before v0.0.14, if you want to serving or inference by TurboMind, you should first convert the model to TurboMind format. Through offline conversion, the model can be loaded faster, but it isn't user-friendly. Therefore, LMDeploy adds the ability of online conversion and support loading huggingface model directly.
+Starting from v0.1.0, TurboMind can pre-process the model parameters on the fly while loading them from huggingface-style models.

 ## Supported model type

 Currently, Turbomind support loading three types of model:

 1. A lmdeploy-quantized model hosted on huggingface.co, such as [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit), etc.
-2. Other hot LM models on huggingface.co like Qwen/Qwen-7B-Chat
-3. A model converted by `lmdeploy convert`, old format
+2. Other LM models on huggingface.co like Qwen/Qwen-7B-Chat
+3. A model converted by `lmdeploy convert`, legacy format

 ## Usage

-### 1) A quantized model managed by lmdeploy / internlm
+### 1) A lmdeploy-quantized model

-For quantized models managed by lmdeploy or internlm, the parameters required for online conversion are already exist in config.json, so you only need to pass the repo_id or local path when using it.
-
-> If config.json has not been updated in time, you need to pass the `--model-name` parameter, please refer to 2)
+TurboMind can directly load models quantized by `lmdeploy.lite`, such as [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit) and [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit).

 ```
-repo_id=lmdeploy/qwen-chat-7b-4bit
+repo_id=internlm/internlm-chat-20b-4bit
+model_name=internlm-chat-20b
 # or
-# repo_id=/path/to/managed_model
+# repo_id=/path/to/downloaded_model

 # Inference by TurboMind
-lmdeploy chat turbomind $repo_id
+lmdeploy chat turbomind $repo_id --model-name $model_name

 # Serving with gradio
-lmdeploy serve gradio $repo_id
+lmdeploy serve gradio $repo_id --model-name $model_name

 # Serving with Restful API
-lmdeploy serve api_server $repo_id --instance_num 32 --tp 1
+lmdeploy serve api_server $repo_id --model-name $model_name --instance_num 32 --tp 1
 ```

-### 2) Other hot LM models
+### 2) Other LM models

-For other popular models such as Qwen/Qwen-7B-Chat or baichuan-inc/Baichuan2-7B-Chat, the name of the model needs to be passed in. LMDeploy supported models can be viewed through `lmdeploy list`.
+For other LM models such as Qwen/Qwen-7B-Chat or baichuan-inc/Baichuan2-7B-Chat, the model name needs to be passed in. The models supported by LMDeploy can be viewed with `lmdeploy list`.

 ```
 repo_id=Qwen/Qwen-7B-Chat

diff --git a/docs/zh_cn/load_hf.md b/docs/zh_cn/load_hf.md
index 1ad37e3ae7..63c08fe2d9 100644
--- a/docs/zh_cn/load_hf.md
+++ b/docs/zh_cn/load_hf.md
@@ -1,6 +1,6 @@
 # 直接读取 huggingface 模型

-在 V0.0.14 版本之前,若想使用 LMDeploy 进行推理或者部署,需要先使用命令 `lmdeploy convert` 将模型离线转换为 TurboMind 推理引擎支持的格式,转换后的模型可以更快地进行加载,但对用户使用来说并不友好,因此,LDMdeploy 决定增加在线转换的功能,支持直接读取 Huggingface 的模型。
+从 v0.1.0 开始,TurboMind 添加了直接读取 Huggingface 格式权重的能力。

 ## 支持的类型

@@ -12,30 +12,30 @@

 ## 使用方式

-### 1) lmdeploy / internlm 所管理的量化模型
+### 1) 通过 lmdeploy 量化的模型

-lmdeploy / internlm 所管理的模型,config.json 中已经有在线转换需要的参数,所以使用时只需要传入 repo_id 或者本地路径即可。
-
-> 如果 config.json 还未及时更新,还需要传入`--model-name` 参数,可参考 2)
+对于通过 `lmdeploy.lite` 量化的模型,TurboMind 可以直接加载,比如 [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit).

 ```
-repo_id=lmdeploy/qwen-chat-7b-4bit
+repo_id=internlm/internlm-chat-20b-4bit
+model_name=internlm-chat-20b
+
 # or
-# repo_id=/path/to/managed_model
+# repo_id=/path/to/downloaded_model

 # Inference by TurboMind
-lmdeploy chat turbomind $repo_id
+lmdeploy chat turbomind $repo_id --model-name $model_name

 # Serving with gradio
-lmdeploy serve gradio $repo_id
+lmdeploy serve gradio $repo_id --model-name $model_name

 # Serving with Restful API
-lmdeploy serve api_server $repo_id --instance_num 32 --tp 1
+lmdeploy serve api_server $repo_id --model-name $model_name --instance_num 32 --tp 1
 ```

 ### 2) 其他的 LM 模型

-其他的比较热门的模型比如 Qwen/Qwen-7B-Chat, baichuan-inc/Baichuan2-7B-Chat,需要传入模型的名字。LMDeploy 模型支持情况可通过 `lmdeploy list` 查看。
+其他 LM 模型比如 Qwen/Qwen-7B-Chat, baichuan-inc/Baichuan2-7B-Chat,需要传入模型的名字。LMDeploy 模型支持情况可通过 `lmdeploy list` 查看。

 ```
 repo_id=Qwen/Qwen-7B-Chat

diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index a7cf090c65..8668dd803a 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -132,7 +132,7 @@ def __init__(self,
         self.stop_words = _stop_words(self.model.stop_words, self.tokenizer)

     def _create_weight(self, model_comm):
-        """Allocate wegiht buffer, load params if from_workspace."""
+        """Allocate weight buffer, load params if from_workspace."""
         # TODO: support mpi
         self.node_id = 0

diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index 9c23c50614..8751024ddd 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -119,14 +119,7 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size,
     enable_custom_all_reduce_(enable_custom_all_reduce)
 {
     INIReader reader;
-    if (!config.empty() && !model_dir.empty()) {
-        TM_LOG_ERROR("[ERROR] config and model_dir are all set");
-        ft::FT_CHECK(false);
-    }
-    else if (config.empty() && model_dir.empty()) {
-        TM_LOG_ERROR("[ERROR] Neither config nor model_dir is set");
-        ft::FT_CHECK(false);
-    }
+    ft::FT_CHECK_WITH_INFO((config.empty() ^ model_dir.empty()), "invalid init options");

     if (!config.empty()) {
         std::FILE* tmpf = std::tmpfile();
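
The `LlamaTritonModel.cc` hunk above collapses two mutually exclusive error branches into a single assertion: `config.empty() ^ model_dir.empty()` is true exactly when one of the two options is provided, so the check rejects both "neither set" and "both set". Below is a minimal standalone C++ sketch of that exactly-one-of-two validation; `check_with_info` and `validate_init_options` are illustrative stand-ins for this example, not the TurboMind or FasterTransformer API.

```cpp
// Standalone sketch (not lmdeploy code): enforce that exactly one of two
// init options is provided, mirroring the XOR check introduced in the patch.
#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical stand-in for ft::FT_CHECK_WITH_INFO: print a message and abort
// when the condition is false.
static void check_with_info(bool condition, const std::string& info)
{
    if (!condition) {
        std::cerr << "[ERROR] " << info << std::endl;
        std::abort();
    }
}

static void validate_init_options(const std::string& config, const std::string& model_dir)
{
    // XOR of the two emptiness flags is true only when exactly one string is empty,
    // i.e. exactly one option was supplied; this replaces the former if / else-if pair.
    check_with_info(config.empty() ^ model_dir.empty(), "invalid init options");
}

int main()
{
    validate_init_options("", "/path/to/downloaded_model");  // ok: only model_dir is set
    // validate_init_options("", "");                        // would abort: neither is set
    // validate_init_options("cfg.ini", "/path/to/model");   // would abort: both are set
    std::cout << "init options are valid\n";
    return 0;
}
```

Expressing the constraint as one assertion keeps the error path in a single place; the trade-off, visible in the patch, is that the message no longer distinguishes the "both set" and "neither set" failure modes, which the removed branches reported separately.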