From f812587c91a3eefe5e0ef29130dc3a958a2ace62 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Thu, 16 May 2024 18:00:05 +0800
Subject: [PATCH] fix

---
 lmdeploy/legacy/pytorch/modules/linear.py     |  8 +++++--
 lmdeploy/lite/apis/auto_awq.py                |  3 ++-
 .../lite/quantization/activation/observer.py  | 13 +++++++++++
 lmdeploy/lite/quantization/awq.py             | 22 ++++++++++++++-----
 lmdeploy/lite/quantization/calibration.py     |  2 ++
 5 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/lmdeploy/legacy/pytorch/modules/linear.py b/lmdeploy/legacy/pytorch/modules/linear.py
index 218a36407e..3a06e69799 100644
--- a/lmdeploy/legacy/pytorch/modules/linear.py
+++ b/lmdeploy/legacy/pytorch/modules/linear.py
@@ -4,6 +4,8 @@
 import torch
 from torch import nn
 
+from lmdeploy.lite.utils.cal_qparams import QParams
+
 try:
     import awq_inference_engine
 except ModuleNotFoundError:
@@ -72,7 +74,8 @@ def __init__(
     def from_linear(cls: Type['WeightOnlyQLinear'],
                     linear: nn.Linear,
                     quantizer: TypeVar('Quantizer'),
-                    awq_layout: bool = True) -> 'WeightOnlyQLinear':
+                    awq_layout: bool = True,
+                    qparams: Optional[QParams] = None) -> 'WeightOnlyQLinear':
         """Create a WeightOnlyQLinear object from a PyTorch Linear object.
 
         Args:
@@ -103,7 +106,8 @@ def from_linear(cls: Type['WeightOnlyQLinear'],
                       group_size)
         qlinear.bias = linear.bias
 
-        qparams = quantizer.calculate_qparams(linear.weight)
+        if qparams is None:
+            qparams = quantizer.calculate_qparams(linear.weight)
         i32_w = quantizer.quant(linear.weight, qparams, real=True)
         i32_w = i32_w.t().contiguous()
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
index 8c8378e679..5ce6e6b0ce 100644
--- a/lmdeploy/lite/apis/auto_awq.py
+++ b/lmdeploy/lite/apis/auto_awq.py
@@ -75,7 +75,6 @@ def auto_awq(model: str,
     fc2fcs = FC_FCS_MAP[layer_type]
     norm2fcs = NORM_FCS_MAP[layer_type]
     input_stats = torch.load(work_dir / 'inputs_stats.pth')
-    act_scales = input_stats['absmax']
     layers = collect_target_modules(model, layer_type)
     fcs = {}
     for l_name, layer in layers.items():
@@ -84,9 +83,11 @@
 
     if search_scale:
         awq_ratios = input_stats['ratios']
+        act_scales = input_stats['absmean']
         awq_layers(layers, fc2fcs, norm2fcs, act_scales, awq_ratios,
                    w_group_size, device)
     else:
+        act_scales = input_stats['absmax']
         smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size,
                       device)
     quant_weights(model, fcs, w_bits, w_sym, w_group_size, device)
diff --git a/lmdeploy/lite/quantization/activation/observer.py b/lmdeploy/lite/quantization/activation/observer.py
index a2249dadb4..3989c77966 100644
--- a/lmdeploy/lite/quantization/activation/observer.py
+++ b/lmdeploy/lite/quantization/activation/observer.py
@@ -62,6 +62,7 @@ class ActivationObserver(GlobalAvailMixin):
 
     Also keeps track of the number of batches observed.
     """
+    observed = False
 
     def __init__(self, dim: int) -> None:
         """Constructor for ActivationObserver.
@@ -80,6 +81,16 @@ def __init__(self, dim: int) -> None:
         self.ratio = None
         self.num_ratio_tracked = 0
 
+    @classmethod
+    def disable(cls):
+        """Skip observation to avoid recomputation during the scale search."""
+        cls.observed = True
+
+    @classmethod
+    def enable(cls):
+        """Resume observation after the scale search."""
+        cls.observed = False
+
     @torch.no_grad()
     def observe(self, x: torch.Tensor, save_input: bool = False) -> None:
         """Function to observe the input tensor and update the max, min, mean,
@@ -88,6 +99,8 @@ def observe(self, x: torch.Tensor, save_input: bool = False) -> None:
         Args:
             x : Input tensor
         """
+        if self.observed:
+            return
         assert len(x.shape) == 3
         assert x.size(2) == self.dim
         cur_val = x.flatten(0, 1)
diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
index ebf920f908..56752b777a 100644
--- a/lmdeploy/lite/quantization/awq.py
+++ b/lmdeploy/lite/quantization/awq.py
@@ -100,7 +100,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
     w_scales = get_weight_scale(concat_w, group_size)
 
     scales = (act_scales.pow(alpha) /
-              w_scales.pow(1 - alpha)).to(device).to(dtype)
+              w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
     scales = scales / (scales[nonzero_positions].max() *
                        scales[nonzero_positions].min()).sqrt()
 
@@ -151,7 +151,7 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
     w_scales = get_weight_scale(concat_w, group_size)
 
     scales = (act_scales.pow(alpha) /
-              w_scales.pow(1 - alpha)).to(device).to(dtype)
+              w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
     scales = scales / (scales.max() * scales.min()).sqrt()
 
     # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
@@ -211,11 +211,16 @@ def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
     """Quantize the weights of the target model's linear layers."""
     from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
     from lmdeploy.lite.quantization import WeightQuantizer
+    from lmdeploy.lite.utils import QParams
     for name, fc in fcs.items():
         fc.to(device)
         quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
-        q_linear = WeightOnlyQLinear.from_linear(fc, quantizer)
-
+        fc.weight.data, scales, zeros = pseudo_quantize_tensor(
+            fc.weight.data, bits, group_size, return_scale_zeros=True)
+        q_linear = WeightOnlyQLinear.from_linear(fc,
+                                                 quantizer,
+                                                 qparams=QParams(
+                                                     scales, zeros))
         parent_name, _, child_name = name.rpartition('.')
         parent = model.get_submodule(parent_name)
         fc.to('cpu')
@@ -253,7 +258,10 @@ def smooth_layers(layers,
         print(f'{l_name} smooth weight done.')
 
 
-def pseudo_quantize_tensor(w, w_bit=8, w_group_size=-1):
+def pseudo_quantize_tensor(w,
+                           w_bit=8,
+                           w_group_size=-1,
+                           return_scale_zeros=False):
     """Pseudo quantize tensor."""
     org_w_shape = w.shape
     if w_group_size > 0:
@@ -274,6 +282,10 @@ def pseudo_quantize_tensor(w, w_bit=8, w_group_size=-1):
     assert torch.isnan(w).sum() == 0
     w = w.reshape(org_w_shape)
 
+    if return_scale_zeros:
+        zeros = zeros.view(org_w_shape[0], -1)
+        scales = scales.view(org_w_shape[0], -1)
+        return w, scales, zeros
     return w
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index 03a1b42c06..97dd2defa5 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -501,8 +501,10 @@ def _forward(mod, *args, **kwargs):
                     *batch_args[i], **batch_kwargs[i]))
                 obs_group = ActivationObserver.find_group(self.inp_obs_group)
                 mod_name = self.mod2name[mod]
+                ActivationObserver.disable()
                 auto_scale_block(mod, batch_kwargs[i], self.w_bits,
                                  self.w_group_size, obs_group, mod_name)
+                ActivationObserver.enable()
                 for key, item in obs_group.items():
                     if key.startswith(f'{mod_name}.'):
                         item.value.cpu()
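
A minimal usage sketch of the new qparams path (not part of the patch): the weight is
fake-quantized once via pseudo_quantize_tensor, and the per-group scales/zero points that
were actually applied are passed to WeightOnlyQLinear.from_linear instead of being
recomputed there. The toy nn.Linear, its shapes, and the bit/group settings below are
illustrative assumptions; running it needs a CUDA device with the awq_inference_engine
kernels installed.

    import torch
    from torch import nn

    from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
    from lmdeploy.lite.quantization import WeightQuantizer
    from lmdeploy.lite.quantization.awq import pseudo_quantize_tensor
    from lmdeploy.lite.utils import QParams

    # Hypothetical toy layer; real usage operates on the model's target Linear modules.
    fc = nn.Linear(128, 256, bias=False).cuda().half()
    quantizer = WeightQuantizer(4, False, 'per_group', 128)

    # Fake-quantize once and keep the per-group scales/zeros that were applied.
    fc.weight.data, scales, zeros = pseudo_quantize_tensor(
        fc.weight.data, w_bit=4, w_group_size=128, return_scale_zeros=True)

    # Pack with the same qparams so the INT weights match the fake-quantized weights.
    q_linear = WeightOnlyQLinear.from_linear(
        fc, quantizer, qparams=QParams(scales, zeros))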