
Commit

Merge branch 'InternLM:main' into internlm3-awq
AllentDan authored Jan 14, 2025
2 parents 73aa78c + 4ac1894 commit 19bba40
Showing 162 changed files with 8,432 additions and 4,484 deletions.
72 changes: 37 additions & 35 deletions .github/scripts/eval_chat_config.py
@@ -1,7 +1,7 @@
from copy import deepcopy

from mmengine.config import read_base
from opencompass.models import TurboMindModel, TurboMindModelwithChatTemplate
from opencompass.models import TurboMindModelwithChatTemplate

with read_base():
# choose a list of datasets
@@ -84,6 +84,8 @@
models as hf_mistral_chat_7b # noqa: F401, E501
from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
models as hf_mixtral_chat_8x7b # noqa: F401, E501
from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
models as lmdeploy_qwen2_5_7b_instruct # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import \
models as hf_qwen1_5_chat_7b # noqa: F401, E501
from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import \
@@ -146,10 +148,8 @@
turbomind_internlm2_5_7b_chat_4bits = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm2_5_7b_chat_kvint4 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm2_5_7b_chat_kvint8 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm2_5_7b_chat_batch1 = deepcopy(*lmdeploy_internlm2_5_7b_chat)
turbomind_internlm2_5_7b_chat_batch1_4bits = deepcopy(
*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm2_5_7b_chat = deepcopy(*lmdeploy_internlm2_5_7b_chat)
pytorch_internlm2_5_7b_chat_w8a8 = deepcopy(*lmdeploy_internlm2_5_7b_chat)

# ===== Configs for internlm/internlm2_5_20b_chat =====
turbomind_internlm2_5_20b_chat = deepcopy(*lmdeploy_internlm2_5_20b_chat)
@@ -181,26 +181,6 @@
turbomind_qwen_7b_chat_kvint8 = deepcopy(*lmdeploy_qwen_7b_chat)
pytorch_qwen_7b_chat = deepcopy(*lmdeploy_qwen_7b_chat)

# ===== Configs for meta-llama/Llama-2-7b-chat-hf =====
turbomind_llama2_7b_chat = dict(type=TurboMindModel,
abbr='tb_llama2_chat_7b',
path='meta-llama/Llama-2-7b-chat-hf',
engine_config=dict(session_len=MAX_SESSION_LEN,
max_batch_size=128),
gen_config=dict(top_k=1,
top_p=0.8,
temperature=1.0,
max_new_tokens=MAX_NEW_TOKENS),
max_out_len=MAX_NEW_TOKENS,
max_seq_len=MAX_SESSION_LEN,
batch_size=128,
meta_template=llama2_meta_template,
run_cfg=dict(num_gpus=1),
end_str='[INST]')
turbomind_llama2_7b_chat_4bits = deepcopy(turbomind_llama2_7b_chat)
turbomind_llama2_7b_chat_kvint4 = deepcopy(turbomind_llama2_7b_chat)
turbomind_llama2_7b_chat_kvint8 = deepcopy(turbomind_llama2_7b_chat)

# ===== Configs for meta-llama/Meta-Llama-3-8B-Instruct =====
turbomind_llama3_8b_instruct = deepcopy(*lmdeploy_llama3_8b_instruct)
turbomind_llama3_8b_instruct_4bits = deepcopy(*lmdeploy_llama3_8b_instruct)
@@ -218,24 +198,44 @@
turbomind_llama3_1_8b_instruct_kvint8 = deepcopy(
turbomind_llama3_1_8b_instruct)
pytorch_llama3_1_8b_instruct = deepcopy(turbomind_llama3_1_8b_instruct)
pytorch_llama3_1_8b_instruct_w8a8 = deepcopy(turbomind_llama3_1_8b_instruct)

# ===== Configs for Qwen/Qwen2-7B-Instruct =====
turbomind_qwen2_7b_instruct = deepcopy(*lmdeploy_qwen2_7b_instruct)
turbomind_qwen2_7b_instruct_4bits = deepcopy(*lmdeploy_qwen2_7b_instruct)
turbomind_qwen2_7b_instruct_kvint4 = deepcopy(*lmdeploy_qwen2_7b_instruct)
turbomind_qwen2_7b_instruct_kvint8 = deepcopy(*lmdeploy_qwen2_7b_instruct)
pytorch_qwen2_7b_instruct = deepcopy(*lmdeploy_qwen2_7b_instruct)
pytorch_qwen2_7b_instruct_w8a8 = deepcopy(*lmdeploy_qwen2_7b_instruct)

# ===== Configs for Qwen/Qwen25-7B-Instruct =====
turbomind_qwen2_5_7b_instruct = deepcopy(*lmdeploy_qwen2_5_7b_instruct)
turbomind_qwen2_5_7b_instruct_4bits = deepcopy(*lmdeploy_qwen2_5_7b_instruct)
turbomind_qwen2_5_7b_instruct_kvint4 = deepcopy(*lmdeploy_qwen2_5_7b_instruct)
turbomind_qwen2_5_7b_instruct_kvint8 = deepcopy(*lmdeploy_qwen2_5_7b_instruct)
pytorch_qwen2_5_7b_instruct = deepcopy(*lmdeploy_qwen2_5_7b_instruct)
pytorch_qwen2_5_7b_instruct_w8a8 = deepcopy(*lmdeploy_qwen2_5_7b_instruct)

# ===== Configs for meta-llama/Llama-2-7b-chat-hf =====
turbomind_llama2_7b_chat = deepcopy(*lmdeploy_llama2_7b_chat)
turbomind_llama2_7b_chat_4bits = deepcopy(*lmdeploy_llama2_7b_chat)
turbomind_llama2_7b_chat_kvint4 = deepcopy(*lmdeploy_llama2_7b_chat)
turbomind_llama2_7b_chat_kvint8 = deepcopy(*lmdeploy_llama2_7b_chat)

for model in [v for k, v in locals().items() if k.startswith('turbomind_')]:
model['engine_config']['max_batch_size'] = 128
model['engine_config']['max_batch_size'] = 1
model['gen_config']['do_sample'] = False
model['batch_size'] = 128
model['batch_size'] = 100

for model in [v for k, v in locals().items() if k.endswith('_4bits')]:
model['engine_config']['model_format'] = 'awq'
model['abbr'] = model['abbr'] + '_4bits'
model['path'] = model['path'] + '-inner-4bits'

for model in [v for k, v in locals().items() if k.endswith('_w8a8')]:
model['abbr'] = model['abbr'] + '_w8a8'
model['path'] = model['path'] + '-inner-w8a8'

for model in [v for k, v in locals().items() if k.endswith('_kvint4')]:
model['engine_config']['quant_policy'] = 4
model['abbr'] = model['abbr'] + '_kvint4'
@@ -247,24 +247,19 @@
for model in [v for k, v in locals().items() if k.startswith('pytorch_')]:
model['abbr'] = model['abbr'].replace('turbomind', 'pytorch')
model['backend'] = 'pytorch'
model['engine_config']['max_batch_size'] = 64
model['gen_config']['do_sample'] = False
model['batch_size'] = 64

for model in [v for k, v in locals().items() if '_batch1' in k]:
model['abbr'] = model['abbr'] + '_batch1'
model['engine_config']['max_batch_size'] = 1
model['batch_size'] = 1
model['gen_config']['do_sample'] = False
model['batch_size'] = 100

basic_pytorch_chat_tp1 = dict(type=TurboMindModelwithChatTemplate,
engine_config=dict(session_len=MAX_SESSION_LEN,
max_batch_size=64,
max_batch_size=1,
tp=1),
gen_config=dict(do_sample=False,
max_new_tokens=MAX_NEW_TOKENS),
max_out_len=MAX_NEW_TOKENS,
max_seq_len=MAX_SESSION_LEN,
batch_size=64,
batch_size=100,
run_cfg=dict(num_gpus=1))

# ===== Configs for Qwen/Qwen1.5-MoE-A2.7B-Chat =====
@@ -277,6 +272,13 @@
pytorch_gemma_2_9b_it['abbr'] = 'pytorch_gemma_2_9b_it'
pytorch_gemma_2_9b_it['path'] = 'google/gemma-2-9b-it'

# ===== Configs for google/gemma2-27b-it =====
pytorch_gemma_2_27b_it = deepcopy(basic_pytorch_chat_tp1)
pytorch_gemma_2_27b_it['abbr'] = 'pytorch_gemma_2_27b_it'
pytorch_gemma_2_27b_it['path'] = 'google/gemma-2-27b-it'
pytorch_gemma_2_27b_it['run_cfg']['num_gpus'] = 2
pytorch_gemma_2_27b_it['engine_config']['tp'] = 2

race_datasets = [race_datasets[1]]

# Summarizer
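Note on the pattern in eval_chat_config.py above: each base model config is duplicated with deepcopy, and suffix-driven loops (_4bits, _w8a8, _kvint4, _kvint8) then specialise the copies in place. A minimal standalone sketch of that pattern follows; the demo config names and paths are hypothetical placeholders, not the real opencompass model entries.

from copy import deepcopy

# Hypothetical base config; the real file builds these from opencompass model imports.
turbomind_demo_chat = dict(abbr='tb_demo_chat',
                           path='org/demo-chat',
                           engine_config=dict(max_batch_size=1),
                           gen_config=dict(do_sample=False))

# Variants start as plain copies of the base config ...
turbomind_demo_chat_4bits = deepcopy(turbomind_demo_chat)
turbomind_demo_chat_kvint4 = deepcopy(turbomind_demo_chat)

# ... and suffix-driven loops specialise them in place.
# AWQ 4-bit variants switch model_format and point at the '-inner-4bits' weights.
for model in [v for k, v in locals().items() if k.endswith('_4bits')]:
    model['engine_config']['model_format'] = 'awq'
    model['abbr'] += '_4bits'
    model['path'] += '-inner-4bits'

# KV-cache int4 variants only change the engine's quant_policy.
for model in [v for k, v in locals().items() if k.endswith('_kvint4')]:
    model['engine_config']['quant_policy'] = 4
    model['abbr'] += '_kvint4'

print(turbomind_demo_chat_4bits['path'])            # org/demo-chat-inner-4bits
print(turbomind_demo_chat_kvint4['engine_config'])  # {'max_batch_size': 1, 'quant_policy': 4}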
16 changes: 10 additions & 6 deletions .github/workflows/benchmark.yml
@@ -88,7 +88,7 @@ jobs:
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Clone repository
uses: actions/checkout@v3
uses: actions/checkout@v2
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
@@ -105,10 +105,8 @@
run: |
# manually install flash attn
# the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
python3 -m pip install -e /root/packages/AutoAWQ_kernels
python3 -m pip install /root/packages/autoawq-0.2.6-cp310-cp310-manylinux2014_x86_64.whl --no-deps
python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps
python3 -m pip install /root/packages/flash_attn-*.whl
python3 -m pip install /root/packages/xformers-*.whl --no-deps
python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
- name: Install lmdeploy
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
@@ -148,9 +146,15 @@
needs: [benchmark]
timeout-minutes: 5
runs-on: [self-hosted, linux-a100]
container:
image: openmmlab/lmdeploy:latest-cu11
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/qa_test_models:/nvme/qa_test_models
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Clone repository
uses: actions/checkout@v3
uses: actions/checkout@v2
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
(Diffs for the remaining changed files are not shown.)
