Commit

Merge branch 'main' into encode
Conflicts:
	lmdeploy/model.py
AllentDan committed Nov 3, 2023
2 parents 740b88b + c15fbf4 commit 8a555bd
Showing 32 changed files with 2,037 additions and 1,150 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -52,7 +52,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by

## Supported Models

`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`.
`LMDeploy` has two inference backends, `Pytorch` and `TurboMind`. You can run `lmdeploy list` to check the supported model names.

### TurboMind

2 changes: 1 addition & 1 deletion README_zh-CN.md
@@ -53,7 +53,7 @@ LMDeploy is developed by [MMDeploy](https://github.com/open-mmlab/mmdeploy) and [MMRazor](ht

## Supported Models

`LMDeploy` supports two inference backends: `TurboMind` and `Pytorch`.
`LMDeploy` supports two inference backends: `TurboMind` and `Pytorch`. Run `lmdeploy list` to see the supported model names.

### TurboMind

34 changes: 31 additions & 3 deletions lmdeploy/cli/cli.py
@@ -28,8 +28,12 @@ def convert(self,
model_name (str): The name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model
model_format (str): The format of the model, fb or hf. 'fb' stands
for META's llama format, and 'hf' means huggingface format.
model_format (str): The format of the model, chosen from
['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means the huggingface llama format, and 'awq' means a
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, which means the model_format will be
inferred from model_name.
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
@@ -38,7 +42,7 @@ def convert(self,
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
"""
from lmdeploy.serve.turbomind.deploy import main as convert
from lmdeploy.turbomind.deploy.converter import main as convert

convert(model_name,
model_path,
@@ -49,6 +53,30 @@
quant_path=quant_path,
group_size=group_size)

def list(self, engine: str = 'turbomind'):
"""List supported model names.
Example 1:
lmdeploy list
Example 2:
lmdeploy list --engine pytorch
Args:
engine (str): The backend for the model to run. Choose from
['turbomind', 'pytorch'].
"""
assert engine in ['turbomind', 'pytorch']
if engine == 'pytorch':
model_names = ['llama', 'llama2', 'internlm-7b']
elif engine == 'turbomind':
from lmdeploy.model import MODELS
model_names = list(MODELS.module_dict.keys())
model_names = [n for n in model_names if n.lower() not in ['base']]
model_names.sort()
print('Supported model names:')
print('\n'.join(model_names))
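For reference, a hedged usage sketch (not part of this commit) for the new `list` subcommand defined above; it simply shells out to the `lmdeploy` console script, and the two invocations mirror the examples in the docstring.

import subprocess

# Print the model names supported by the default TurboMind backend.
subprocess.run(['lmdeploy', 'list'], check=True)

# Print the names supported by the PyTorch backend.
subprocess.run(['lmdeploy', 'list', '--engine', 'pytorch'], check=True)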


def run():
"""The entry point of running LMDeploy CLI."""
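For context, a hedged sketch of calling the converter entry point that the CLI now imports (`lmdeploy.turbomind.deploy.converter.main`); the two positional arguments follow the call shown above, while the keyword names and all values are illustrative placeholders based on the updated docstring rather than taken from this diff.

from lmdeploy.turbomind.deploy.converter import main as convert

convert('llama2',                 # model_name, e.g. one printed by `lmdeploy list`
        '/path/to/hf/model',      # model_path (placeholder directory)
        model_format='hf',        # one of ['llama', 'hf', 'awq', None]
        group_size=0)             # AWQ group size; only meaningful for 'awq' models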
16 changes: 12 additions & 4 deletions lmdeploy/lite/apis/auto_awq.py
@@ -15,13 +15,15 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}

@@ -40,6 +42,9 @@ def auto_awq(model: str,
hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
checkpoint = hf_config._name_or_path

# Hard-coded for Qwen; other model configs do not have the `fp16` attribute.
hf_config.fp16 = True

with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
@@ -61,11 +66,14 @@
device_map[name] = 'cpu'
else:
device_map[name] = 0
load_checkpoint_in_model(model, checkpoint, device_map)
load_checkpoint_in_model(model,
checkpoint,
device_map,
dtype=torch.float16)

work_dir = Path(work_dir)

act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmean']
act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
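To show how the pieces above fit together, here is a hedged sketch of driving the AWQ entry point; only `model` (the first argument) and `work_dir` are visible in this diff, so everything else is omitted, and the model id and directory are placeholders.

from lmdeploy.lite.apis.auto_awq import auto_awq

# The work_dir must already contain the inputs_stats.pth file written by a prior
# calibration run, since auto_awq reads the 'absmax' activation scales from it.
auto_awq('internlm/internlm-chat-7b', work_dir='./awq_work_dir')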
110 changes: 106 additions & 4 deletions lmdeploy/lite/apis/calibrate.py
@@ -1,10 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.

from pathlib import Path
from typing import Union

import torch
from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model)
from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from lmdeploy.lite.quantization import CalibrationContext
@@ -13,17 +15,90 @@
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}


def _prepare_for_calibrate(model: nn.Module,
layer_type: Union[str, type],
head_name: str = 'lm_head',
device: str = 'cuda',
prefix: str = '') -> None:
"""Prepare the model for calibration by moving specific modules to CPU.
This function goes through each child of a given model and checks whether
it is an instance of a certain layer type or has the name equal to
`head_name`.
If yes, it moves the module to CPU, otherwise to the specified device
(default is CUDA).
If the child contains the target layer type in its sub-modules, the
function performs the same operation recursively.
Parameters
----------
model : nn.Module
The PyTorch model to prepare for calibration.
layer_type : Union[str, Type]
The type of the layer to be moved to CPU. Can be either a string of
class name or the class type itself.
head_name : str, optional
The name of the module to be moved to CPU. Default is 'lm_head'.
device : str, optional
The device to which modules not matching the `layer_type` or
`head_name` will be moved. Default is 'cuda'.
prefix : str, optional
The prefix used when printing the names of the moved modules.
Default is ''.
Raises
------
TypeError
If `layer_type` is neither a string nor a type.
"""

for name, child in model.named_children():

# Check if the child is an instance of the given layer type
if isinstance(layer_type, str):
is_layer = type(child).__name__ == layer_type
elif isinstance(layer_type, type):
is_layer = isinstance(child, layer_type)
else:
raise TypeError(
'layer_type should be a string (class name) or a type')

# Check if the child contains the target module type
contain_layer = len(
collect_target_modules(child, layer_type, [head_name]).keys()) > 0

# Check if the child matches the head name
is_head = name == head_name

mod_name = f'{prefix}.{name}' if prefix else name

# If the child is either an instance of the layer type or has the
# head name, move it to CPU, otherwise move it to the specified device
if is_layer or is_head:
child.to('cpu')
print(f'Move {mod_name} to CPU.')
elif contain_layer:
_prepare_for_calibrate(child, layer_type, head_name, device,
mod_name)
else:
child.to(device)
print(f'Move {mod_name} to GPU.')
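A hedged usage sketch for `_prepare_for_calibrate` as defined above, assuming `model` is an already-instantiated Llama-style `transformers` model: every `LlamaDecoderLayer` and the `lm_head` module are moved to CPU, and all remaining modules (embeddings, final norm, etc.) are moved to CUDA.

# `model` is assumed to be a LlamaForCausalLM instance loaded elsewhere.
_prepare_for_calibrate(model,
                       layer_type='LlamaDecoderLayer',
                       head_name='lm_head',
                       device='cuda')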


def calibrate(model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
@@ -54,16 +129,38 @@ def calibrate(model: str,
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
trust_remote_code=True)
hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True)
hf_config = AutoConfig.from_pretrained(model,
torch_dtype=torch.float16,
trust_remote_code=True)
checkpoint = hf_config._name_or_path

# Hard-coded for Qwen; other model configs do not have the `fp16` attribute.
hf_config.fp16 = True

with init_empty_weights():
# Load model
model = AutoModelForCausalLM.from_pretrained(model,
config=hf_config,
torch_dtype=torch.float16,
trust_remote_code=True)
model.config.use_cache = False

model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(
f'Currently, quantization and calibration of {model_type} are '
f'not supported. The supported model types are '
f"{', '.join(LAYER_TYPE_MAP.keys())}.")

if model_type == 'QWenLMHeadModel':
try:
import flash_attn # noqa: F401
except ImportError:
raise RuntimeError(
'When using Qwen, you need to `pip install flash-attn` first, '
'otherwise calibration and quantization will not work '
'properly.')

layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]

@@ -77,7 +174,12 @@ def calibrate(model: str,
device_map[name] = 'cpu'
else:
device_map[name] = 0
load_checkpoint_in_model(model, checkpoint, device_map)
load_checkpoint_in_model(model,
checkpoint,
device_map,
dtype=torch.float16)

_prepare_for_calibrate(model, layer_type, 'lm_head', device)

print('Loading calibrate dataset ...')
calib_loader, _ = get_calib_loaders(calib_dataset,
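A hedged sketch of invoking the calibration API whose diff appears above; only `model`, `calib_dataset`, and `calib_samples` are visible in this hunk, so the model id is a placeholder and any further parameters (such as an output directory for the collected statistics) are deliberately left out.

from lmdeploy.lite.apis.calibrate import calibrate

# Placeholder HF model id; the collected activation statistics are what
# auto_awq later consumes.
calibrate('internlm/internlm-chat-7b',
          calib_dataset='c4',
          calib_samples=128)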
14 changes: 11 additions & 3 deletions lmdeploy/lite/quantization/awq.py
@@ -18,6 +18,10 @@
'QWenBlock': {
'ln_1': ['attn.c_attn'],
'ln_2': ['mlp.w1', 'mlp.w2']
},
'DecoderLayer': {
'input_layernorm': ['self_attn.W_pack'],
'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
}
}

@@ -33,6 +37,10 @@
'QWenBlock': {
'attn.c_attn': ['attn.c_proj'],
'mlp.w1': ['mlp.c_proj']
},
'DecoderLayer': {
'self_attn.W_pack': ['self_attn.o_proj'],
'mlp.up_proj': ['mlp.down_proj']
}
}

@@ -69,7 +77,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)

scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()

ln.weight.div_(scales)
@@ -116,10 +124,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
w_scales = get_weight_scale(concat_w, group_size)

scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()

# (for qwen) pre_fc is packed QKV, only V needs to scale
# (for qwen & baichuan) pre_fc is the packed QKV projection; only V needs to be scaled
if size_pre_fc > size_a and size_pre_fc % size_a == 0 \
and size_pre_fc // size_a == 3:

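To make the scale computation above concrete, here is a small numeric sketch of the smoothing formula shared by `smooth_ln_fcs` and `smooth_fc_fcs` (note that this commit drops the `.clamp(min=1e-4)` on the ratio); the tensors and `alpha = 0.5` are made-up values, not taken from the repository.

import torch

act_scales = torch.tensor([2.0, 0.5, 1.0])  # per-channel activation scales
w_scales = torch.tensor([1.0, 2.0, 0.5])    # per-channel weight scales
alpha = 0.5                                 # assumed smoothing strength

scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
scales = scales / (scales.max() * scales.min()).sqrt()  # normalize by the geometric mean of the extremes
# As in the diff, the preceding norm (or fc) weight is divided by `scales`;
# per the AWQ smoothing scheme, the following fcs are scaled up to compensate.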
15 changes: 9 additions & 6 deletions lmdeploy/lite/quantization/weight/quantizer.py
@@ -8,7 +8,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax)
cal_qparams_per_tensor_minmax, precise_round)
from lmdeploy.lite.utils.global_avail import GlobalAvailMixin


@@ -119,8 +119,10 @@ def quant(self,
torch.Tensor: The fake quantized weight tensor.
"""

float_w = weight.float()

if qparams is None:
qparams = self.calculate_qparams(weight)
qparams = self.calculate_qparams(float_w)

scales = qparams.scales
zero_points = qparams.zero_points
@@ -133,17 +135,18 @@
# per group scales shape: [out_c, in_c//group_size, 1]
if len(scales.shape) > 2:
# scales shape: [out_c, in_c//group_size, 1]
weight = weight.reshape(out_c, scales.shape[1], -1)
float_w = float_w.reshape(out_c, scales.shape[1], -1)

if zero_points is None:
assert self.symmetry
real_qweight = (weight / scales).round()
real_qweight = (float_w / scales).round()
fake_qweight = real_qweight * scales

else:
assert not self.symmetry

real_qweight = (weight / scales).round() + zero_points
real_qweight = precise_round(
(float_w - float_w.min(-1, keepdim=True)[0]) / scales)
fake_qweight = (real_qweight - zero_points) * scales

if len(scales.shape) > 2:
@@ -153,4 +156,4 @@
if real:
return real_qweight.to(torch.int32)
else:
return fake_qweight
return fake_qweight.to(weight.dtype)
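A self-contained numeric sketch of the asymmetric path above: weights are cast to float, shifted by their per-group minimum, rounded to integer levels, then mapped back through the zero point. `torch.round` stands in for `precise_round`, whose definition is not part of this diff, and the 4-bit min/max qparams are computed inline as an assumption about what `calculate_qparams` returns.

import torch

w = torch.randn(2, 8).half()                # [out_channels, group_size], fp16 like the real weights
float_w = w.float()

qmax = 2**4 - 1                             # 4-bit unsigned levels: 0..15
w_min = float_w.min(-1, keepdim=True)[0]
w_max = float_w.max(-1, keepdim=True)[0]
scales = (w_max - w_min) / qmax             # assumed group-wise min/max scales
zero_points = torch.round(-w_min / scales)  # assumed zero-point convention

real_qweight = torch.round((float_w - w_min) / scales)  # integer levels in [0, 15]
fake_qweight = (real_qweight - zero_points) * scales    # dequantized weights
fake_qweight = fake_qweight.to(w.dtype)                 # cast back, as in the diff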
4 changes: 2 additions & 2 deletions lmdeploy/lite/utils/__init__.py
@@ -6,7 +6,7 @@
cal_qparams_per_group_absmax,
cal_qparams_per_group_minmax,
cal_qparams_per_tensor_absmax,
cal_qparams_per_tensor_minmax)
cal_qparams_per_tensor_minmax, precise_round)
from .calib_dataloader import get_calib_loaders
from .collect import (bimap_name_mod, collect_target_modules,
collect_target_weights)
@@ -16,7 +16,7 @@
'cal_qparams_per_channel_absmax', 'cal_qparams_per_channel_minmax',
'cal_qparams_per_group_absmax', 'cal_qparams_per_group_minmax',
'cal_qparams_per_tensor_absmax', 'cal_qparams_per_tensor_minmax',
'QParams', 'get_calib_loaders', 'collect_target_modules',
'QParams', 'get_calib_loaders', 'collect_target_modules', 'precise_round',
'collect_target_weights', 'GlobalAvailMixin', 'split_decoder_layer_inputs',
'bimap_name_mod', 'concat_decoder_layer_outputs'
]