From 9565505989c911f996fea646bc945cf8f276532f Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 25 Dec 2024 12:21:01 +0800 Subject: [PATCH] Support torch_dtype modification and update FAQs for AWQ quantization (#2898) * Support torch_dtype modification and update FAQs for AWQ quantization * fix lint * add clamp-zeros option * add guidance * datasets proxy * remove clamp_zeros * fix comments * print * fix ut hf-token --------- Co-authored-by: RunningLeon --- .github/workflows/unit-test.yml | 1 - docs/en/quantization/w4a16.md | 4 ++++ docs/zh_cn/quantization/w4a16.md | 5 +++++ lmdeploy/cli/lite.py | 4 ++++ lmdeploy/cli/utils.py | 2 +- lmdeploy/lite/apis/auto_awq.py | 8 ++++--- lmdeploy/lite/apis/calibrate.py | 12 +++++++--- lmdeploy/lite/apis/gptq.py | 17 ++++++++++---- lmdeploy/lite/apis/smooth_quant.py | 13 +++++++++-- lmdeploy/lite/quantization/awq.py | 27 ++++++++++++++++++----- lmdeploy/lite/quantization/calibration.py | 14 +++++++++--- lmdeploy/lite/utils/load.py | 18 ++++++++++----- 12 files changed, 97 insertions(+), 28 deletions(-) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 14c865e801..3a459050ec 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -91,7 +91,6 @@ jobs: echo "TODO" - name: Test lmdeploy python UT run: | - huggingface-cli login --token ${{ secrets.HF_TOKEN }} coverage run --branch --source lmdeploy -m pytest -rsE tests coverage xml coverage report -m diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 0aa1e17a5b..c36c3736c6 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -128,3 +128,7 @@ We benchmarked the Llama-2-7B-chat and Llama-2-13B-chat models with 4-bit quanti | ---------------- | ------- | ------- | --------- | | Llama-2-7B-chat | 112.9 | 159.4 | 206.4 | | Llama-2-13B-chat | N/A | 90.7 | 115.8 | + +## FAQs + +1. Out of Memory error during quantization due to insufficient GPU memory: This can be addressed by reducing the parameter `--calib-seqlen`, increasing the parameter `--calib-samples`, and set `--batch-size` to 1. diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index d69a8a23d2..3cea164dd9 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -131,3 +131,8 @@ lmdeploy serve api_client http://0.0.0.0:23333 | ---------------- | ------- | ------- | --------- | | Llama-2-7B-chat | 112.9 | 159.4 | 206.4 | | Llama-2-13B-chat | N/A | 90.7 | 115.8 | + +## 快速问答 + +1. 量化时出现 Out of Memory 显存不够:可以通过减小传参 `--calib-seqlen`,增大传参 `--calib-samples`,并使用 `--batch-size` 为 1。 +2. 
量化时,无法链接huggingface并下载数据集。可以尝试使用镜像,`export HF_ENDPOINT=https://hf-mirror.com`。 diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index d76d6a5f34..236e022b34 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -35,6 +35,7 @@ def add_parser_auto_awq(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + ArgumentHelper.dtype(parser) parser.add_argument( '--device', type=str, @@ -71,6 +72,7 @@ def add_parser_auto_gptq(): ArgumentHelper.calib_samples(parser) ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) + ArgumentHelper.dtype(parser) parser.add_argument('--w-bits', type=int, default=4, @@ -99,6 +101,7 @@ def add_parser_calibrate(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + ArgumentHelper.dtype(parser) @staticmethod def add_parser_smooth_quant(): @@ -122,6 +125,7 @@ def add_parser_smooth_quant(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + ArgumentHelper.dtype(parser) @staticmethod def auto_awq(args): diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 85784a58f5..cf7b6526ec 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -354,7 +354,7 @@ def calib_batchsize(parser): @staticmethod def calib_search_scale(parser): - """Add argument batch_size to parser.""" + """Add argument search_scale to parser.""" return parser.add_argument( '--search-scale', diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index c41b28fd6e..2c84612839 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -2,6 +2,7 @@ import os import os.path as osp import shutil +from typing import Literal import torch from torch import nn @@ -12,9 +13,7 @@ from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.check_env import try_import_deeplink -from .calibrate import LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate - -NORM_TYPE_MAP = NORM_TYPE_MAP # legacy +from .calibrate import LAYER_TYPE_MAP, calibrate def save_vl_model(vl_model, model_path, dst_path): @@ -56,6 +55,7 @@ def auto_awq(model: str, search_scale: bool = False, device: str = 'cuda', revision: str = None, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', download_dir: str = None): """Perform weight quantization using AWQ algorithm. @@ -77,6 +77,7 @@ def auto_awq(model: str, revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. + dtype (str): Data type for loading model weights and calib infer. download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface. """ @@ -96,6 +97,7 @@ def auto_awq(model: str, w_bits=w_bits, w_group_size=w_group_size, search_scale=search_scale, + dtype=dtype, batch_size=batch_size) layer_type = LAYER_TYPE_MAP[type(model).__name__] diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 4759e95b4f..307cf6d7e9 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from pathlib import Path -from typing import Union +from typing import Literal, Union import torch from torch import nn @@ -205,6 +205,7 @@ def calibrate(model: str, w_bits: int = 4, w_group_size: int = 128, search_scale: bool = False, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', batch_size: int = 1) -> None: """The main function for loading the model and performing calibration on a given dataset. @@ -225,6 +226,7 @@ def calibrate(model: str, w_group_size (int): Group size for weight quantization statistics. search_scale (bool): Whether search scale ratio. Default to False, which means only smooth quant with 0.5 ratio will be applied. + dtype (str): Data type for loading model weights and calib infer. batch_size (int): The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM. @@ -246,7 +248,7 @@ def calibrate(model: str, if model_type == 'llm': model = load_hf_from_pretrained(model, - torch_dtype=torch.float16, + dtype=dtype, trust_remote_code=True) vl_model = None elif model_type == 'vlm': @@ -257,7 +259,11 @@ def calibrate(model: str, if hasattr(vl_model, 'llm'): # MiniCPMV model = vl_model.llm model.config.use_cache = False - model.half().eval() + if dtype == 'float16': + model.half() + elif dtype == 'bfloat16': + model.to(torch.bfloat16) + model.eval() model_type = type(model).__name__ if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP: diff --git a/lmdeploy/lite/apis/gptq.py b/lmdeploy/lite/apis/gptq.py index 12b88a52cd..eb4418a533 100644 --- a/lmdeploy/lite/apis/gptq.py +++ b/lmdeploy/lite/apis/gptq.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import Literal import torch -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoTokenizer from lmdeploy.lite.utils.calib_dataloader import get_calib_loaders @@ -15,6 +16,7 @@ def auto_gptq(model: str, calib_samples: int = 128, calib_seqlen: int = 2048, batch_size: int = 1, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', revision: str = None): """Perform weight quantization using AWQ algorithm. @@ -29,9 +31,7 @@ def auto_gptq(model: str, calib_seqlen (int): The sequence length for calibration. w_bits (int): Bit number for weight quantization. w_group_size (int): Group size for weight quantization statistics. - search_scale (bool): Whether search scale ratio. Default to False, - which means only smooth quant with 0.5 ratio will be applied. - device (str): Device type of running. + dtype (str): Data type for loading model weights and calib infer. revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 
@@ -83,9 +83,18 @@ def auto_gptq(model: str, # load un-quantized model, by default, # the model will always be loaded into CPU memory + hf_config = AutoConfig.from_pretrained(pretrained_model_dir, + revision=revision, + trust_remote_code=True) + torch_dtype = getattr(hf_config, 'torch_dtype', torch.float16) + if dtype == 'float16': + torch_dtype = torch.float16 + elif dtype == 'bfloat16': + torch_dtype = torch.bfloat16 model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config, revision=revision, + torch_dtype=torch_dtype, trust_remote_code=True) # quantize model, the examples should be list of dict whose keys diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py index c8df67355e..188eedbd0e 100644 --- a/lmdeploy/lite/apis/smooth_quant.py +++ b/lmdeploy/lite/apis/smooth_quant.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. + +from typing import Literal + import fire import torch from torch import nn @@ -6,7 +9,8 @@ from lmdeploy.lite.apis.calibrate import (LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate) from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP, - awq_layers, smooth_layers) + awq_layers, skipped_module, + smooth_layers) from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.models import QLinear, QRMSNorm @@ -19,8 +23,8 @@ def smooth_quant(model: str, search_scale: bool = False, batch_size: int = 1, w_bits: int = 8, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', device: str = 'cuda'): - model_path = model vl_model, model, tokenizer, work_dir = calibrate(model, calib_dataset, @@ -31,6 +35,7 @@ def smooth_quant(model: str, w_bits=w_bits, w_group_size=-1, search_scale=search_scale, + dtype=dtype, batch_size=batch_size) # calibrate function exports the calibration statistics @@ -76,6 +81,8 @@ def smooth_quant(model: str, rmsnorms = collect_target_modules(model, norm_type) for name, linear in fcs.items(): + if skipped_module(name): + continue linear.to(device) q_linear = QLinear.from_float(linear) parent_name, _, child_name = name.rpartition('.') @@ -84,6 +91,8 @@ def smooth_quant(model: str, linear.to('cpu') for name, norm in rmsnorms.items(): + if skipped_module(name): + continue norm.to(device) q_norm = QRMSNorm.from_float(norm) parent_name, _, child_name = name.rpartition('.') diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index cf03a75216..3e24a13cc3 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -43,8 +43,10 @@ 'MixtralDecoderLayer': { 'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], - 'post_attention_layernorm': - ['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3'] + 'post_attention_layernorm': [ + 'block_sparse_moe.gate', 'block_sparse_moe.experts.{i}.w1', + 'block_sparse_moe.experts.{i}.w3' + ] }, 'Qwen2VLDecoderLayer': { 'input_layernorm': @@ -120,7 +122,12 @@ def get_weight_scale(weight, q_group_size=-1): org_shape = weight.shape if q_group_size > 0: weight = weight.view(-1, q_group_size) - scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True) + abs_weight = weight.abs() + abs_weight_amax = abs_weight.amax(dim=1, keepdim=True) + if abs_weight_amax.min().item() == 0: + print('weight.amax.min is zero, clamping weight.amax to 1e-4') + abs_weight_amax = abs_weight_amax.clamp(min=1e-4) + scale = abs_weight / abs_weight_amax scale = scale.view(org_shape) scale = scale.mean(0) return scale @@ -153,8 +160,13 @@ def 
smooth_ln_fcs(ln: torch.nn.Module, concat_w = torch.cat([fc.weight for fc in fcs], dim=0) w_scales = get_weight_scale(concat_w, group_size) + w_scales_pow = w_scales.pow(1 - alpha) + if w_scales_pow.min().item() == 0: + print('w_scales.pow(1 - alpha).min is zero, ' + 'clamping w_scales.pow(1 - alpha) to 1e-4') + w_scales_pow = w_scales_pow.clamp(min=1e-4) scales = (act_scales.pow(alpha) / - w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype) + w_scales_pow).clamp(min=1e-4).to(device).to(dtype) scales = scales / (scales[nonzero_positions].max() * scales[nonzero_positions].min()).sqrt() @@ -204,8 +216,13 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module, concat_w = torch.cat([fc.weight for fc in fcs], dim=0) w_scales = get_weight_scale(concat_w, group_size) + w_scales_pow = w_scales.pow(1 - alpha) + if w_scales_pow.min().item() == 0: + print('w_scales.pow(1 - alpha).min is zero, ' + 'clamping w_scales.pow(1 - alpha) to 1e-4') + w_scales_pow = w_scales_pow.clamp(min=1e-4) scales = (act_scales.pow(alpha) / - w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype) + w_scales_pow).clamp(min=1e-4).to(device).to(dtype) scales = scales / (scales.max() * scales.min()).sqrt() # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py index e590f1a4eb..1df8f2c740 100644 --- a/lmdeploy/lite/quantization/calibration.py +++ b/lmdeploy/lite/quantization/calibration.py @@ -42,6 +42,9 @@ def __init__(self, tokenizer (PreTrainedTokenizer): Tokenizer of the given model. layer_type (Union[str, type]): Type of the layers to be observed. norm_type (Union[str, type]): Norm type used in the model. + batch_size (int): The batch size for running the calib samples. + Low GPU mem requires small batch_size. Large batch_size + reduces the calibration time while costing more VRAM. device (str, optional): Device where the model should run. Defaults to 'cuda'. """ @@ -290,9 +293,14 @@ def _search_module_scale(block, linears2scale: list, x, kwargs={}): org_sd = {k: v.cpu() for k, v in block.state_dict().items()} for ratio in range(0, n_grid): - ratio = ratio * 1 / n_grid - scales = (x_max.pow(ratio) / - w_mean.pow(1 - ratio)).clamp(min=1e-4).view(-1) + ratio = ratio / n_grid + w_mean_pow = w_mean.pow(1 - ratio) + if w_mean_pow.min().item() == 0: + print('w_mean.pow(1 - ratio).min is zero, ' + 'clamping w_mean.pow(1 - ratio) to 1e-4') + w_mean_pow = w_mean_pow.clamp(min=1e-4) + scales = (x_max.pow(ratio) / w_mean_pow).clamp(min=1e-4).view(-1) + scales = scales / (scales.max() * scales.min()).sqrt() for fc in linears2scale: fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) diff --git a/lmdeploy/lite/utils/load.py b/lmdeploy/lite/utils/load.py index bfd306a743..170c149778 100644 --- a/lmdeploy/lite/utils/load.py +++ b/lmdeploy/lite/utils/load.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Literal + import torch from transformers import AutoConfig, AutoModelForCausalLM @@ -7,8 +9,8 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, - dtype=torch.float16, - **kwargs): + dtype: Literal['float16', 'bfloat16', + 'auto'], **kwargs): if dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): raise RuntimeError('Your device does not supports bf16(bfloat16), ' @@ -21,10 +23,14 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) # HACK hard code for qwen, other configs do not have the `fp16` attribute.
- if dtype == torch.float16: - hf_config.fp16 = True - elif dtype == torch.bfloat16: - hf_config.bf16 = True + if hasattr(hf_config, 'fp16') or hasattr(hf_config, 'bf16'): + if dtype == 'bfloat16': + hf_config.bf16 = True + else: + hf_config.fp16 = True + + if dtype != 'auto': + setattr(hf_config, 'torch_dtype', dtype) with LoadNoInit(): # Load model
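Taken together, the changes resolve the effective torch dtype the same way in each entry point: an explicit 'float16' or 'bfloat16' always wins, 'auto' defers to the torch_dtype recorded in the checkpoint config (falling back to float16 when none is recorded, as in gptq.py above), and load_hf_from_pretrained guards against requesting bf16 on hardware without bf16 support. Below is a standalone sketch of that resolution rule; the helper name `resolve_torch_dtype` is illustrative and not part of the patch.

```python
import torch
from transformers import AutoConfig


def resolve_torch_dtype(model_path: str, dtype: str = 'auto') -> torch.dtype:
    """Illustrative sketch of the dtype handling introduced by this patch."""
    if dtype == 'bfloat16' and not torch.cuda.is_bf16_supported():
        raise RuntimeError('bf16 is not supported on this device, '
                           'please use float16 instead')
    if dtype == 'float16':
        return torch.float16
    if dtype == 'bfloat16':
        return torch.bfloat16
    # dtype == 'auto': defer to the checkpoint config, defaulting to float16
    # when the config does not record a torch_dtype.
    hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    return getattr(hf_config, 'torch_dtype', None) or torch.float16
```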