Commit

Merge branch 'main' into encode
Conflicts:
	lmdeploy/model.py
AllentDan committed Nov 3, 2023
2 parents 740b88b + c15fbf4 commit 8a555bd
Showing 32 changed files with 2,037 additions and 1,150 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -52,7 +52,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by

## Supported Models

`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`.
`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.

### TurboMind

2 changes: 1 addition & 1 deletion README_zh-CN.md
@@ -53,7 +53,7 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht

## Supported Models

`LMDeploy` supports two inference backends: `TurboMind` and `Pytorch`.
`LMDeploy` supports two inference backends: `TurboMind` and `Pytorch`. Run `lmdeploy list` to see the supported model names.

### TurboMind

34 changes: 31 additions & 3 deletions lmdeploy/cli/cli.py
@@ -28,8 +28,12 @@ def convert(self,
model_name (str): The name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model
model_format (str): The format of the model, fb or hf. 'fb' stands
for META's llama format, and 'hf' means huggingface format.
model_format (str): The format of the model, chosen from
['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means the huggingface llama format, and 'awq' means a
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, which means the model_format will be
inferred from model_name.
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
@@ -38,7 +42,7 @@ def convert(self,
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
"""
from lmdeploy.serve.turbomind.deploy import main as convert
from lmdeploy.turbomind.deploy.converter import main as convert

convert(model_name,
model_path,
@@ -49,6 +53,30 @@
quant_path=quant_path,
group_size=group_size)

def list(self, engine: str = 'turbomind'):
"""List supported model names.
Example 1:
lmdeploy list
Example 2:
lmdeploy list --engine pytorch
Args:
engine (str): The backend for the model to run. Choose from
['turbomind', 'pytorch'].
"""
assert engine in ['turbomind', 'pytorch']
if engine == 'pytorch':
model_names = ['llama', 'llama2', 'internlm-7b']
elif engine == 'turbomind':
from lmdeploy.model import MODELS
model_names = list(MODELS.module_dict.keys())
model_names = [n for n in model_names if n.lower() not in ['base']]
model_names.sort()
print('Supported model names:')
print('\n'.join(model_names))
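For reference, a hedged usage sketch (not part of this commit) for the new `list` subcommand defined above; it simply shells out to the `lmdeploy` console script, and the two invocations mirror the examples in the docstring.

import subprocess

# Print the model names supported by the default TurboMind backend.
subprocess.run(['lmdeploy', 'list'], check=True)

# Print the names supported by the PyTorch backend.
subprocess.run(['lmdeploy', 'list', '--engine', 'pytorch'], check=True)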


def run():
"""The entry point of running LMDeploy CLI."""
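For context, a hedged sketch of calling the converter entry point that the CLI now imports (`lmdeploy.turbomind.deploy.converter.main`); the two positional arguments follow the call shown above, while the keyword names and all values are illustrative placeholders based on the updated docstring rather than taken from this diff.

from lmdeploy.turbomind.deploy.converter import main as convert

convert('llama2',                 # model_name, e.g. one printed by `lmdeploy list`
        '/path/to/hf/model',      # model_path (placeholder directory)
        model_format='hf',        # one of ['llama', 'hf', 'awq', None]
        group_size=0)             # AWQ group size; only meaningful for 'awq' models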
16 changes: 12 additions & 4 deletions lmdeploy/lite/apis/auto_awq.py
@@ -15,13 +15,15 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}

@@ -40,6 +42,9 @@ def auto_awq(model: str,
hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
checkpoint = hf_config._name_or_path

# Hard-coded for Qwen; other model configs do not have the `fp16` attribute.
hf_config.fp16 = True

with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
@@ -61,11 +66,14 @@
device_map[name] = 'cpu'
else:
device_map[name] = 0
load_checkpoint_in_model(model, checkpoint, device_map)
load_checkpoint_in_model(model,
checkpoint,
device_map,
dtype=torch.float16)

work_dir = Path(work_dir)

act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmean']
act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
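To show how the pieces above fit together, here is a hedged sketch of driving the AWQ entry point; only `model` (the first argument) and `work_dir` are visible in this diff, so everything else is omitted, and the model id and directory are placeholders.

from lmdeploy.lite.apis.auto_awq import auto_awq

# The work_dir must already contain the inputs_stats.pth file written by a prior
# calibration run, since auto_awq reads the 'absmax' activation scales from it.
auto_awq('internlm/internlm-chat-7b', work_dir='./awq_work_dir')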
110 changes: 106 additions & 4 deletions lmdeploy/lite/apis/calibrate.py
@@ -1,10 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.

from pathlib import Path
from typing import Union

import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from lmdeploy.lite.quantization import CalibrationContext
@@ -13,17 +15,90 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}


def _prepare_for_calibrate(model: nn.Module,
layer_type: Union[str, type],
head_name: str = 'lm_head',
device: str = 'cuda',
prefix: str = '') -> None:
"""Prepare the model for calibration by moving specific modules to CPU.
This function goes through each child of a given model and checks whether
it is an instance of a certain layer type or has the name equal to
`head_name`.
If yes, it moves the module to CPU, otherwise to the specified device
(default is CUDA).
If the child contains the target layer type in its sub-modules, the
function performs the same operation recursively.
Parameters
----------
model : nn.Module
The PyTorch model to prepare for calibration.
layer_type : Union[str, Type]
The type of the layer to be moved to CPU. Can be either a string of
class name or the class type itself.
head_name : str, optional
The name of the module to be moved to CPU. Default is 'lm_head'.
device : str, optional
The device to which modules not matching the `layer_type` or
`head_name` will be moved. Default is 'cuda'.
prefix : str, optional
The prefix used when printing the names of the moved modules.
Default is ''.
Raises
------
TypeError
If `layer_type` is neither a string nor a type.
"""

for name, child in model.named_children():

# Check if the child is an instance of the given layer type
if isinstance(layer_type, str):
is_layer = type(child).__name__ == layer_type
elif isinstance(layer_type, type):
is_layer = isinstance(child, layer_type)
else:
raise TypeError(
'layer_type should be a string (class name) or a type')

# Check if the child contains the target module type
contain_layer = len(
collect_target_modules(child, layer_type, [head_name]).keys()) > 0

# Check if the child matches the head name
is_head = name == head_name

mod_name = f'{prefix}.{name}' if prefix else name

# If the child is either an instance of the layer type or has the
# head name, move it to CPU, otherwise move it to the specified device
if is_layer or is_head:
child.to('cpu')
print(f'Move {mod_name} to CPU.')
elif contain_layer:
_prepare_for_calibrate(child, layer_type, head_name, device,
mod_name)
else:
child.to(device)
print(f'Move {mod_name} to GPU.')
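A hedged usage sketch for `_prepare_for_calibrate` as defined above, assuming `model` is an already-instantiated Llama-style `transformers` model: every `LlamaDecoderLayer` and the `lm_head` module are moved to CPU, and all remaining modules (embeddings, final norm, etc.) are moved to CUDA.

# `model` is assumed to be a LlamaForCausalLM instance loaded elsewhere.
_prepare_for_calibrate(model,
                       layer_type='LlamaDecoderLayer',
                       head_name='lm_head',
                       device='cuda')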


def calibrate(model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
@@ -54,16 +129,38 @@ def calibrate(model: str,
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
trust_remote_code=True)
hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
hf_config = AutoConfig.from_pretrained(model,
torch_dtype=torch.float16,
trust_remote_code=True)
checkpoint = hf_config._name_or_path

# Hard-coded for Qwen; other model configs do not have the `fp16` attribute.
hf_config.fp16 = True

with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
config=hf_config,
torch_dtype=torch.float16,
trust_remote_code=True)
model.config.use_cache = False

model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(
f'Currently, quantization and calibration of {model_type} are '
f'not supported. The supported model types are '
f"{', '.join(LAYER_TYPE_MAP.keys())}.")

if model_type == 'QWenLMHeadModel':
try:
import flash_attn # noqa: F401
except ImportError:
raise RuntimeError(
'When using Qwen, you need to `pip install flash-attn` first, '
'otherwise calibration and quantization will not work '
'properly.')

layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]

@@ -77,7 +174,12 @@ def calibrate(model: str,
device_map[name] = 'cpu'
else:
device_map[name] = 0
load_checkpoint_in_model(model, checkpoint, device_map)
load_checkpoint_in_model(model,
checkpoint,
device_map,
dtype=torch.float16)

_prepare_for_calibrate(model, layer_type, 'lm_head', device)

print('Loading calibrate dataset ...')
calib_loader, _ = get_calib_loaders(calib_dataset,
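A hedged sketch of invoking the calibration API whose diff appears above; only `model`, `calib_dataset`, and `calib_samples` are visible in this hunk, so the model id is a placeholder and any further parameters (such as an output directory for the collected statistics) are deliberately left out.

from lmdeploy.lite.apis.calibrate import calibrate

# Placeholder HF model id; the collected activation statistics are what
# auto_awq later consumes.
calibrate('internlm/internlm-chat-7b',
          calib_dataset='c4',
          calib_samples=128)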
14 changes: 11 additions & 3 deletions lmdeploy/lite/quantization/awq.py
@@ -18,6 +18,10 @@
'QWenBlock': {
'ln_1': ['attn.c_attn'],
'ln_2': ['mlp.w1', 'mlp.w2']
},
'DecoderLayer': {
'input_layernorm': ['self_attn.W_pack'],
'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
}
}

@@ -33,6 +37,10 @@
'QWenBlock': {
'attn.c_attn': ['attn.c_proj'],
'mlp.w1': ['mlp.c_proj']
},
'DecoderLayer': {
'self_attn.W_pack': ['self_attn.o_proj'],
'mlp.up_proj': ['mlp.down_proj']
}
}

@@ -69,7 +77,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)

scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()

ln.weight.div_(scales)
@@ -116,10 +124,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)

scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()

# (for qwen) pre_fc is packed QKV, only V needs to scale
# (for qwen & baichuan) pre_fc is the packed QKV projection; only V needs to be scaled
if size_pre_fc > size_a and size_pre_fc % size_a == 0 \
and size_pre_fc // size_a == 3:

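To make the scale computation above concrete, here is a small numeric sketch of the smoothing formula shared by `smooth_ln_fcs` and `smooth_fc_fcs` (note that this commit drops the `.clamp(min=1e-4)` on the ratio); the tensors and `alpha = 0.5` are made-up values, not taken from the repository.

import torch

act_scales = torch.tensor([2.0, 0.5, 1.0])  # per-channel activation scales
w_scales = torch.tensor([1.0, 2.0, 0.5])    # per-channel weight scales
alpha = 0.5                                 # assumed smoothing strength

scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
scales = scales / (scales.max() * scales.min()).sqrt()  # normalize by the geometric mean of the extremes
# As in the diff, the preceding norm (or fc) weight is divided by `scales`;
# per the AWQ smoothing scheme, the following fcs are scaled up to compensate.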
15 changes: 9 additions & 6 deletions lmdeploy/lite/quantization/weight/quantizer.py
@@ -8,7 +8,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax)
cal_qparams_per_tensor_minmax, precise_round)
from lmdeploy.lite.utils.global_avail import GlobalAvailMixin


@@ -119,8 +119,10 @@ def quant(self,
torch.Tensor: The fake quantized weight tensor.
"""

float_w = weight.float()

if qparams is None:
qparams = self.calculate_qparams(weight)
qparams = self.calculate_qparams(float_w)

scales = qparams.scales
zero_points = qparams.zero_points
@@ -133,17 +135,18 @@
# per group scales shape: [out_c, in_c//group_size, 1]
if len(scales.shape) > 2:
# scales shape: [out_c, in_c//group_size, 1]
weight = weight.reshape(out_c, scales.shape[1], -1)
float_w = float_w.reshape(out_c, scales.shape[1], -1)

if zero_points is None:
assert self.symmetry
real_qweight = (weight / scales).round()
real_qweight = (float_w / scales).round()
fake_qweight = real_qweight * scales

else:
assert not self.symmetry

real_qweight = (weight / scales).round() + zero_points
real_qweight = precise_round(
(float_w - float_w.min(-1, keepdim=True)[0]) / scales)
fake_qweight = (real_qweight - zero_points) * scales

if len(scales.shape) > 2:
@@ -153,4 +156,4 @@
if real:
return real_qweight.to(torch.int32)
else:
return fake_qweight
return fake_qweight.to(weight.dtype)
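A self-contained numeric sketch of the asymmetric path above: weights are cast to float, shifted by their per-group minimum, rounded to integer levels, then mapped back through the zero point. `torch.round` stands in for `precise_round`, whose definition is not part of this diff, and the 4-bit min/max qparams are computed inline as an assumption about what `calculate_qparams` returns.

import torch

w = torch.randn(2, 8).half()                # [out_channels, group_size], fp16 like the real weights
float_w = w.float()

qmax = 2**4 - 1                             # 4-bit unsigned levels: 0..15
w_min = float_w.min(-1, keepdim=True)[0]
w_max = float_w.max(-1, keepdim=True)[0]
scales = (w_max - w_min) / qmax             # assumed group-wise min/max scales
zero_points = torch.round(-w_min / scales)  # assumed zero-point convention

real_qweight = torch.round((float_w - w_min) / scales)  # integer levels in [0, 15]
fake_qweight = (real_qweight - zero_points) * scales    # dequantized weights
fake_qweight = fake_qweight.to(w.dtype)                 # cast back, as in the diff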
4 changes: 2 additions & 2 deletions lmdeploy/lite/utils/__init__.py
@@ -6,7 +6,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax)
cal_qparams_per_tensor_minmax, precise_round)
from .calib_dataloader import get_calib_loaders
from .collect import (bimap_name_mod, collect_target_modules,
collect_target_weights)
@@ -16,7 +16,7 @@
'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax',
'cal_qparams_per_group_absmax', 'cal_qparams_per_group_minmax',
'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax',
'QParams', 'get_calib_loaders', 'collect_target_modules',
'QParams', 'get_calib_loaders', 'collect_target_modules', 'precise_round',
'collect_target_weights', 'GlobalAvailMixin', 'split_decoder_layer_inputs',
'bimap_name_mod', 'concat_decoder_layer_outputs'
]