
Commit

merge main
grimoire committed Oct 17, 2023
2 parents ec9ba00 + bb3cce9 commit bd5ca4c
Showing 72 changed files with 1,575 additions and 485 deletions.
19 changes: 12 additions & 7 deletions .github/workflows/docker.yml
@@ -24,13 +24,18 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Check disk space
        run: |
          rm -rf ${GITHUB_WORKSPACE}/.git
          cat /proc/cpuinfo | grep -ic proc
          free
          df -h
          df . -h
      - name: Free disk space
        uses: jlumbroso/free-disk-space@main
        with:
          # Setting this to "true" frees about 6 GB, but might remove tools that are actually needed
          tool-cache: false
          docker-images: false
          # All of these default to true, but feel free to set to "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          swap-storage: false
      - name: Get docker info
        run: |
          docker info
30 changes: 23 additions & 7 deletions .github/workflows/linux-x64-gpu.yml
@@ -27,14 +27,30 @@ permissions:
jobs:
  cuda-118:
    runs-on: ubuntu-latest
    container: openmmlab/lmdeploy-builder:cuda11.8
    steps:
      - name: Free disk space
        uses: jlumbroso/free-disk-space@main
        with:
          # Setting this to "true" frees about 6 GB, but might remove tools that are actually needed
          tool-cache: false
          docker-images: false
          # All of these default to true, but feel free to set to "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          swap-storage: false
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Build
        run: |
          source /opt/conda/bin/activate
          conda activate py38
          mkdir build && cd build
          bash ../generate.sh
          make -j$(nproc) && make install
        uses: addnab/docker-run-action@v3
        with:
          image: openmmlab/lmdeploy-builder:cuda11.8
          options: -v ${{ github.workspace }}:/work --cpus=1.8
          run: |
            cd /work
            source /opt/conda/bin/activate
            conda activate py38
            mkdir build && cd build
            bash ../generate.sh
            make -j$(nproc) && make install
12 changes: 12 additions & 0 deletions .github/workflows/pypi.yml
@@ -21,6 +21,18 @@ jobs:
      DOCKER_TAG: cuda11.8
      OUTPUT_FOLDER: cuda11.8_dist
    steps:
      - name: Free disk space
        uses: jlumbroso/free-disk-space@main
        with:
          # Setting this to "true" frees about 6 GB, but might remove tools that are actually needed
          tool-cache: false
          docker-images: false
          # All of these default to true, but feel free to set to "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          swap-storage: false
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Build
File renamed without changes.
34 changes: 22 additions & 12 deletions README.md
@@ -20,6 +20,10 @@ ______________________________________________________________________

## News 🎉

- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
- \[2023/09\] TurboMind supports all features of Code Llama: code completion, infilling, chat / instruct, and the Python specialist. Click [here](./docs/en/supported_models/codellama.md) for the deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
@@ -55,19 +59,25 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
> **Note**<br />
> W4A16 inference requires Nvidia GPU with Ampere architecture or above.
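Ampere corresponds to a compute capability of 8.0 or higher. A quick, hedged way to check your GPU — assuming a driver recent enough for `nvidia-smi` to expose the `compute_cap` query field:

```shell
# Ampere and newer GPUs report a compute capability of 8.0 or above
nvidia-smi --query-gpu=name,compute_cap --format=csv
```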
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| InternLM | Yes | Yes | Yes | Yes | No |
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | Yes | Yes | No |
| Llama2 | Yes | Yes | Yes | Yes | No |
| InternLM-7B | Yes | Yes | Yes | Yes | No |
| InternLM-20B | Yes | Yes | Yes | Yes | No |
| QWen-7B | Yes | Yes | Yes | No | No |
| QWen-14B | Yes | Yes | Yes | No | No |
| Baichuan-7B | Yes | Yes | Yes | Yes | No |
| Baichuan2-7B | Yes | Yes | No | No | No |
| Code Llama | Yes | Yes | No | No | No |

### PyTorch

| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM | Yes | Yes | No | No | No |
| Models | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama | Yes | Yes | No | No | No |
| Llama2 | Yes | Yes | No | No | No |
| InternLM-7B | Yes | Yes | No | No | No |

## Performance

@@ -101,7 +111,7 @@ pip install lmdeploy

# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/internlm/internlm-chat-7b /path/to/internlm-chat-7b
git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internlm-chat-7b

# if you want to clone without large files – just their pointers
# prepend your git clone with the following env var:
@@ -226,7 +236,7 @@ LMDeploy uses [AWQ](https://arxiv.org/abs/2306.00978) algorithm for model weight
[Click here](./docs/en/kv_int8.md) to view the usage method, implementation formula, and test results for kv int8.

> **Warning**<br />
> runtime Tensor Parallel for quantilized model is not available. Please setup `--tp` on `deploy` to enable static TP.
> runtime Tensor Parallel for quantized models is not available. Please set up `--tp` on `deploy` to enable static TP.
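As an illustration of the static TP path, the sketch below assumes the `lmdeploy.serve.turbomind.deploy` entry point, a placeholder model path, and a two-GPU split; model names and exact flags may differ in your version, so treat it as a template rather than a verified command:

```shell
# Convert HF weights into a TurboMind workspace with a fixed tensor-parallel degree of 2
# (model name and path are placeholders; confirm the exact flags with --help)
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b --tp 2
```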
## Contributing

32 changes: 21 additions & 11 deletions README_zh-CN.md
@@ -20,6 +20,10 @@ ______________________________________________________________________

## News 🎉

- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports the InternLM-20B model
- \[2023/09\] TurboMind supports all Code Llama features: code completion, infilling, chat / instruct, and the Python specialist. Click [here](./docs/zh_cn/supported_models/codellama.md) for the deployment guide
- \[2023/09\] TurboMind supports Baichuan2-7B
- \[2023/08\] TurboMind supports flash-attention2
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling, and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
@@ -56,19 +60,25 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht
> **Note**<br />
> W4A16 inference requires an Nvidia GPU with Ampere architecture or above
| Models   | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama    | Yes             | Yes  | Yes     | Yes   | No   |
| Llama2   | Yes             | Yes  | Yes     | Yes   | No   |
| InternLM | Yes             | Yes  | Yes     | Yes   | No   |
| Models       | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :----------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama        | Yes             | Yes  | Yes     | Yes   | No   |
| Llama2       | Yes             | Yes  | Yes     | Yes   | No   |
| InternLM-7B  | Yes             | Yes  | Yes     | Yes   | No   |
| InternLM-20B | Yes             | Yes  | Yes     | Yes   | No   |
| QWen-7B      | Yes             | Yes  | Yes     | No    | No   |
| QWen-14B     | Yes             | Yes  | Yes     | No    | No   |
| Baichuan-7B  | Yes             | Yes  | Yes     | Yes   | No   |
| Baichuan2-7B | Yes             | Yes  | No      | No    | No   |
| Code Llama   | Yes             | Yes  | No      | No    | No   |

### PyTorch

| Models   | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama    | Yes             | Yes  | No      | No    | No   |
| Llama2   | Yes             | Yes  | No      | No    | No   |
| InternLM | Yes             | Yes  | No      | No    | No   |
| Models      | Tensor Parallel | FP16 | KV INT8 | W4A16 | W8A8 |
| :---------: | :-------------: | :--: | :-----: | :---: | :--: |
| Llama       | Yes             | Yes  | No      | No    | No   |
| Llama2      | Yes             | Yes  | No      | No    | No   |
| InternLM-7B | Yes             | Yes  | No      | No    | No   |

## Performance

@@ -102,7 +112,7 @@ pip install lmdeploy

# Make sure you have git-lfs installed (https://git-lfs.com)
git lfs install
git clone https://huggingface.co/internlm/internlm-chat-7b /path/to/internlm-chat-7b
git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internlm-chat-7b

# if you want to clone without large files – just their pointers
# prepend your git clone with the following env var:
24 changes: 21 additions & 3 deletions benchmark/README.md
@@ -23,10 +23,14 @@ python profile_throughput.py \

`profile_generation.py` performs benchmarks with dummy data. It requires the `nvidia-ml-py` package:

```shell
pip install nvidia-ml-py
```

```bash
python profile_generation.py \
/path/to/your/model \
--concurrency 8 --input_seqlen 0 --output_seqlen 2048
--model-path /path/to/your/model \
--concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
```

## profile serving
@@ -38,7 +42,21 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r

# The tokenizer path ends with .model for most models; otherwise, pass model_path/triton_models/tokenizer.
python profile_serving.py \
    ${TritonServerAddress} \
    /path/to/tokenizer \
    ShareGPT_V3_unfiltered_cleaned_split.json \
    --concurrency 64
```

## profile restful api

`profile_restful_api.py` benchmarks the RESTful API server.
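The benchmark expects an API server to be running already. Below is a hedged sketch of starting one; it assumes the `lmdeploy.serve.openai.api_server` entry point, a converted `./workspace` directory, and host/port arguments that may differ across versions — check `--help` before relying on them.

```shell
# Start the API server from a converted TurboMind workspace
# (the workspace path is a placeholder; host/port arguments are assumptions)
python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 23333
```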

```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# The tokenizer path ends with .model for most models; otherwise, pass model_path/triton_models/tokenizer.
python profile_restful_api.py \
    ${ServerAddress} \
    /path/to/tokenizer \
    ShareGPT_V3_unfiltered_cleaned_split.json \
    --concurrency 64
```
