From f812587c91a3eefe5e0ef29130dc3a958a2ace62 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Thu, 16 May 2024 18:00:05 +0800
Subject: [PATCH] fix

---
 lmdeploy/legacy/pytorch/modules/linear.py     |  8 +++++--
 lmdeploy/lite/apis/auto_awq.py                |  3 ++-
 .../lite/quantization/activation/observer.py  | 13 +++++++++++
 lmdeploy/lite/quantization/awq.py             | 22 ++++++++++++++-----
 lmdeploy/lite/quantization/calibration.py     |  2 ++
 5 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/lmdeploy/legacy/pytorch/modules/linear.py b/lmdeploy/legacy/pytorch/modules/linear.py
index 218a36407e..3a06e69799 100644
--- a/lmdeploy/legacy/pytorch/modules/linear.py
+++ b/lmdeploy/legacy/pytorch/modules/linear.py
@@ -4,6 +4,8 @@
 import torch
 from torch import nn
 
+from lmdeploy.lite.utils.cal_qparams import QParams
+
 try:
     import awq_inference_engine
 except ModuleNotFoundError:
@@ -72,7 +74,8 @@ def __init__(
     def from_linear(cls: Type['WeightOnlyQLinear'],
                     linear: nn.Linear,
                     quantizer: TypeVar('Quantizer'),
-                    awq_layout: bool = True) -> 'WeightOnlyQLinear':
+                    awq_layout: bool = True,
+                    qparams: Optional[QParams] = None) -> 'WeightOnlyQLinear':
         """Create a WeightOnlyQLinear object from a PyTorch Linear object.
 
         Args:
@@ -103,7 +106,8 @@ def from_linear(cls: Type['WeightOnlyQLinear'],
                       group_size)
         qlinear.bias = linear.bias
 
-        qparams = quantizer.calculate_qparams(linear.weight)
+        if qparams is None:
+            qparams = quantizer.calculate_qparams(linear.weight)
         i32_w = quantizer.quant(linear.weight, qparams, real=True)
         i32_w = i32_w.t().contiguous()
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
index 8c8378e679..5ce6e6b0ce 100644
--- a/lmdeploy/lite/apis/auto_awq.py
+++ b/lmdeploy/lite/apis/auto_awq.py
@@ -75,7 +75,6 @@ def auto_awq(model: str,
     fc2fcs = FC_FCS_MAP[layer_type]
     norm2fcs = NORM_FCS_MAP[layer_type]
     input_stats = torch.load(work_dir / 'inputs_stats.pth')
-    act_scales = input_stats['absmax']
     layers = collect_target_modules(model, layer_type)
     fcs = {}
     for l_name, layer in layers.items():
@@ -84,9 +83,11 @@
 
     if search_scale:
         awq_ratios = input_stats['ratios']
+        act_scales = input_stats['absmean']
         awq_layers(layers, fc2fcs, norm2fcs, act_scales, awq_ratios,
                    w_group_size, device)
     else:
+        act_scales = input_stats['absmax']
         smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size,
                       device)
     quant_weights(model, fcs, w_bits, w_sym, w_group_size, device)
diff --git a/lmdeploy/lite/quantization/activation/observer.py b/lmdeploy/lite/quantization/activation/observer.py
index a2249dadb4..3989c77966 100644
--- a/lmdeploy/lite/quantization/activation/observer.py
+++ b/lmdeploy/lite/quantization/activation/observer.py
@@ -62,6 +62,7 @@ class ActivationObserver(GlobalAvailMixin):
 
     Also keeps track of the number of batches observed.
     """
+    observed = False
 
     def __init__(self, dim: int) -> None:
         """Constructor for ActivationObserver.
@@ -80,6 +81,16 @@ def __init__(self, dim: int) -> None:
         self.ratio = None
         self.num_ratio_tracked = 0
 
+    @classmethod
+    def disable(cls):
+        """Skip observation to avoid recomputation during the scale search."""
+        cls.observed = True
+
+    @classmethod
+    def enable(cls):
+        """Resume observation after the scale search."""
+        cls.observed = False
+
     @torch.no_grad()
     def observe(self, x: torch.Tensor, save_input: bool = False) -> None:
         """Function to observe the input tensor and update the max, min, mean,
@@ -88,6 +99,8 @@ def observe(self, x: torch.Tensor, save_input: bool = False) -> None:
         Args:
             x : Input tensor
         """
+        if self.observed:
+            return
         assert len(x.shape) == 3
         assert x.size(2) == self.dim
         cur_val = x.flatten(0, 1)
diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
index ebf920f908..56752b777a 100644
--- a/lmdeploy/lite/quantization/awq.py
+++ b/lmdeploy/lite/quantization/awq.py
@@ -100,7 +100,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
     w_scales = get_weight_scale(concat_w, group_size)
 
     scales = (act_scales.pow(alpha) /
-              w_scales.pow(1 - alpha)).to(device).to(dtype)
+              w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
     scales = scales / (scales[nonzero_positions].max() *
                        scales[nonzero_positions].min()).sqrt()
 
@@ -151,7 +151,7 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
     w_scales = get_weight_scale(concat_w, group_size)
 
     scales = (act_scales.pow(alpha) /
-              w_scales.pow(1 - alpha)).to(device).to(dtype)
+              w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
     scales = scales / (scales.max() * scales.min()).sqrt()
 
     # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
@@ -211,11 +211,16 @@ def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
     """Quantize the weights of the target model's linear layers."""
     from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
     from lmdeploy.lite.quantization import WeightQuantizer
+    from lmdeploy.lite.utils import QParams
     for name, fc in fcs.items():
         fc.to(device)
         quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
-        q_linear = WeightOnlyQLinear.from_linear(fc, quantizer)
-
+        fc.weight.data, scales, zeros = pseudo_quantize_tensor(
+            fc.weight.data, bits, group_size, return_scale_zeros=True)
+        q_linear = WeightOnlyQLinear.from_linear(fc,
+                                                 quantizer,
+                                                 qparams=QParams(
+                                                     scales, zeros))
         parent_name, _, child_name = name.rpartition('.')
         parent = model.get_submodule(parent_name)
         fc.to('cpu')
@@ -253,7 +258,10 @@ def smooth_layers(layers,
         print(f'{l_name} smooth weight done.')
 
 
-def pseudo_quantize_tensor(w, w_bit=8, w_group_size=-1):
+def pseudo_quantize_tensor(w,
+                           w_bit=8,
+                           w_group_size=-1,
+                           return_scale_zeros=False):
     """Pseudo quantize tensor."""
     org_w_shape = w.shape
     if w_group_size > 0:
@@ -274,6 +282,10 @@ def pseudo_quantize_tensor(w, w_bit=8, w_group_size=-1):
     assert torch.isnan(w).sum() == 0
     w = w.reshape(org_w_shape)
 
+    if return_scale_zeros:
+        zeros = zeros.view(org_w_shape[0], -1)
+        scales = scales.view(org_w_shape[0], -1)
+        return w, scales, zeros
     return w
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index 03a1b42c06..97dd2defa5 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -501,8 +501,10 @@ def _forward(mod, *args, **kwargs):
                     *batch_args[i], **batch_kwargs[i]))
                 obs_group = ActivationObserver.find_group(self.inp_obs_group)
                 mod_name = self.mod2name[mod]
+                ActivationObserver.disable()
                 auto_scale_block(mod, batch_kwargs[i], self.w_bits,
                                  self.w_group_size, obs_group, mod_name)
+                ActivationObserver.enable()
                 for key, item in obs_group.items():
                     if key.startswith(f'{mod_name}.'):
                         item.value.cpu()
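
A minimal usage sketch of the new qparams path (not part of the patch): the weight is
fake-quantized once via pseudo_quantize_tensor, and the per-group scales/zero points that
were actually applied are passed to WeightOnlyQLinear.from_linear instead of being
recomputed there. The toy nn.Linear, its shapes, and the bit/group settings below are
illustrative assumptions; running it needs a CUDA device with the awq_inference_engine
kernels installed.

    import torch
    from torch import nn

    from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
    from lmdeploy.lite.quantization import WeightQuantizer
    from lmdeploy.lite.quantization.awq import pseudo_quantize_tensor
    from lmdeploy.lite.utils import QParams

    # Hypothetical toy layer; real usage operates on the model's target Linear modules.
    fc = nn.Linear(128, 256, bias=False).cuda().half()
    quantizer = WeightQuantizer(4, False, 'per_group', 128)

    # Fake-quantize once and keep the per-group scales/zeros that were applied.
    fc.weight.data, scales, zeros = pseudo_quantize_tensor(
        fc.weight.data, w_bit=4, w_group_size=128, return_scale_zeros=True)

    # Pack with the same qparams so the INT weights match the fake-quantized weights.
    q_linear = WeightOnlyQLinear.from_linear(
        fc, quantizer, qparams=QParams(scales, zeros))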