Support InternLM 20B (#440)
* better profiler

* wait for releasing mem

* remove fire

* remove support for multiple model benchmark

* comments

* support actual seqlen

* change chat template

* update

* fix ut

* int->size_t

* output more details

* correct tp

* rollback

* update

* update readme

* add 'internlm-chat' as the default tag for internlm chat models

* rollback tokenizer

---------

Co-authored-by: AllentDan <[email protected]>
Co-authored-by: grimoire <[email protected]>
3 people authored Sep 20, 2023
1 parent 19ff47d commit df7955d
Showing 4 changed files with 39 additions and 18 deletions.
14 changes: 8 additions & 6 deletions README.md
@@ -20,6 +20,7 @@ ______________________________________________________________________

## News 🎉

+- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and python specialist. Click [here](./docs/en/supported_models/codellama.md) for deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
@@ -61,19 +62,20 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
-| InternLM | Yes | Yes | Yes | Yes | No |
+| InternLM-7B | Yes | Yes | Yes | Yes | No |
+| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
| Code Llama | Yes | Yes | No | No | No |

### Pytorch

-| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
-| :------: | :-------------: | :--: | :-----: | :---: | :--: |
-| Llama | Yes | Yes | No | No | No |
-| Llama2 | Yes | Yes | No | No | No |
-| InternLM | Yes | Yes | No | No | No |
+| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
+| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
+| Llama | Yes | Yes | No | No | No |
+| Llama2 | Yes | Yes | No | No | No |
+| InternLM-7B | Yes | Yes | No | No | No |

## Performance

14 changes: 8 additions & 6 deletions README_zh-CN.md
@@ -20,6 +20,7 @@ ______________________________________________________________________

## News 🎉

+- \[2023/09\] TurboMind supports the InternLM-20B model
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat, and Python specialist. Click [here](./docs/zh_cn/supported_models/codellama.md) for the deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2
@@ -62,19 +63,20 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht
| :----------: | :------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
-| InternLM | Yes | Yes | Yes | Yes | No |
+| InternLM-7B | Yes | Yes | Yes | Yes | No |
+| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
| Code Llama | Yes | Yes | No | No | No |

### Pytorch

-| Models | Model Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
-| :------: | :------: | :--: | :-----: | :---: | :--: |
-| Llama | Yes | Yes | No | No | No |
-| Llama2 | Yes | Yes | No | No | No |
-| InternLM | Yes | Yes | No | No | No |
+| Models | Model Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
+| :---------: | :------: | :--: | :-----: | :---: | :--: |
+| Llama | Yes | Yes | No | No | No |
+| Llama2 | Yes | Yes | No | No | No |
+| InternLM-7B | Yes | Yes | No | No | No |

## Performance

25 changes: 21 additions & 4 deletions lmdeploy/model.py
@@ -55,7 +55,7 @@ def get_prompt(self, prompt, sequence_start=True):

    @abstractmethod
    def decorate_prompt(self, prompt, sequence_start):
-        pass
+        return prompt

    @staticmethod
    def _translate_messages(messages: List):
@@ -169,14 +169,15 @@ def messages2prompt(self, messages, sequence_start=True):
        return ret


+@MODELS.register_module(name='internlm-chat')
@MODELS.register_module(name='internlm-chat-7b')
class InternLMChat7B(BaseModel):
    """Chat template of InternLM model."""

    def __init__(self,
                 system='',
                 user='<|User|>',
-                 eoh='<eoh>',
+                 eoh='',
                 eoa='<eoa>',
                 assistant='<|Bot|>',
                 **kwargs):
@@ -223,27 +224,41 @@ def messages2prompt(self, messages, sequence_start=True):
        for user, assistant in zip(users, assistants):
            if assistant:
                ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:' \
-                       f'{assistant}{self.eoa}'
+                       f'{assistant}{self.eoa}\n'
            else:
                ret += f'{self.user}:{user}{self.eoh}\n{self.assistant}:'
        return ret

    @property
    def stop_words(self):
        """Return the stop-words' token ids."""
-        return [103027, 103028]
+        return [103028]


+@MODELS.register_module(name='internlm-chat-20b')
@MODELS.register_module(name='internlm-chat-7b-8k')
class InternLMChat7B8K(InternLMChat7B):
+    """Chat template and generation parameters of InternLM-Chat-7B-8K and
+    InternLM-Chat-20B models."""

    def __init__(self, session_len=8192, **kwargs):
        super(InternLMChat7B8K, self).__init__(**kwargs)
        self.session_len = session_len


+@MODELS.register_module(name='internlm-20b')
+class InternLMBaseModel20B(BaseModel):
+    """Generation parameters of InternLM-20B-Base model."""
+
+    def __init__(self, session_len=4096, capability='completion', **kwargs):
+        super().__init__(session_len=session_len,
+                         capability=capability,
+                         **kwargs)


@MODELS.register_module(name='baichuan-7b')
class Baichuan7B(BaseModel):
    """Generation parameters of Baichuan-7B base model."""

    def __init__(self, repetition_penalty=1.1, **kwargs):
        super().__init__(**kwargs)
@@ -252,6 +267,8 @@ def __init__(self, repetition_penalty=1.1, **kwargs):

@MODELS.register_module(name='baichuan2-7b')
class Baichuan2_7B(BaseModel):
+    """Chat template and generation parameters of Baichuan2-7B-Base and
+    Baichuan2-7B-Chat models."""

    def __init__(self,
                 temperature=0.3,
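For reference, here is a minimal sketch (not part of the commit) of how the updated template and registry tags behave; it assumes the OpenAI-style `messages` list of role/content dicts that `messages2prompt` consumes elsewhere in lmdeploy:

```python
from lmdeploy.model import MODELS

# 'internlm-chat' (new in this commit) resolves to the same template class
# as 'internlm-chat-7b'; 'internlm-chat-20b' reuses InternLMChat7B8K.
assert MODELS.get('internlm-chat') is MODELS.get('internlm-chat-7b')

model = MODELS.get('internlm-chat-20b')()
print(model.session_len)  # 8192, inherited from InternLMChat7B8K
print(model.stop_words)   # [103028], trimmed from [103027, 103028]

# With eoh='' and the trailing '\n' now appended after eoa, a two-turn
# conversation renders roughly as:
#   <|User|>:Hi
#   <|Bot|>:Hello!<eoa>
#   <|User|>:What is LMDeploy?
#   <|Bot|>:
messages = [
    {'role': 'user', 'content': 'Hi'},
    {'role': 'assistant', 'content': 'Hello!'},
    {'role': 'user', 'content': 'What is LMDeploy?'},
]
print(model.messages2prompt(messages))
```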
4 changes: 2 additions & 2 deletions tests/test_lmdeploy/test_model.py
@@ -7,7 +7,7 @@ def test_base_model():
    model = MODELS.get('llama')()
    assert model is not None
    assert model.capability == 'chat'
-    assert model.get_prompt('test') is None
+    assert model.get_prompt('test') == 'test'
    assert model.stop_words is None

    model = MODELS.get('internlm')(capability='completion')
@@ -72,7 +72,7 @@ def test_baichuan():

    model = MODELS.get('baichuan-7b')(capability='chat')
    _prompt = model.get_prompt(prompt, sequence_start=True)
-    assert _prompt is None
+    assert _prompt == prompt


def test_llama2():

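The first assertion change above pins down the new `decorate_prompt` default: a base template now passes the prompt through unchanged rather than returning None. A quick check mirroring the test:

```python
from lmdeploy.model import MODELS

model = MODELS.get('llama')()
# decorate_prompt now returns the prompt instead of falling through `pass`,
# so get_prompt echoes its input for the plain 'llama' template.
assert model.get_prompt('test') == 'test'
```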