From 9565505989c911f996fea646bc945cf8f276532f Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 25 Dec 2024 12:21:01 +0800 Subject: [PATCH] Support torch_dtype modification and update FAQs for AWQ quantization (#2898) * Support torch_dtype modification and update FAQs for AWQ quantization * fix lint * add clamp-zeros option * add guidance * datasets proxy * remove clamp_zeros * fix comments * print * fix ut hf-token --------- Co-authored-by: RunningLeon --- .github/workflows/unit-test.yml | 1 - docs/en/quantization/w4a16.md | 4 ++++ docs/zh_cn/quantization/w4a16.md | 5 +++++ lmdeploy/cli/lite.py | 4 ++++ lmdeploy/cli/utils.py | 2 +- lmdeploy/lite/apis/auto_awq.py | 8 ++++--- lmdeploy/lite/apis/calibrate.py | 12 +++++++--- lmdeploy/lite/apis/gptq.py | 17 ++++++++++---- lmdeploy/lite/apis/smooth_quant.py | 13 +++++++++-- lmdeploy/lite/quantization/awq.py | 27 ++++++++++++++++++----- lmdeploy/lite/quantization/calibration.py | 14 +++++++++--- lmdeploy/lite/utils/load.py | 18 ++++++++++----- 12 files changed, 97 insertions(+), 28 deletions(-) diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 14c865e801..3a459050ec 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -91,7 +91,6 @@ jobs: echo "TODO" - name: Test lmdeploy python UT run: | - huggingface-cli login --token ${{ secrets.HF_TOKEN }} coverage run --branch --source lmdeploy -m pytest -rsE tests coverage xml coverage report -m diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md index 0aa1e17a5b..c36c3736c6 100644 --- a/docs/en/quantization/w4a16.md +++ b/docs/en/quantization/w4a16.md @@ -128,3 +128,7 @@ We benchmarked the Llama-2-7B-chat and Llama-2-13B-chat models with 4-bit quanti | ---------------- | ------- | ------- | --------- | | Llama-2-7B-chat | 112.9 | 159.4 | 206.4 | | Llama-2-13B-chat | N/A | 90.7 | 115.8 | + +## FAQs + +1. Out of Memory error during quantization due to insufficient GPU memory: This can be addressed by reducing the parameter `--calib-seqlen`, increasing the parameter `--calib-samples`, and set `--batch-size` to 1. diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md index d69a8a23d2..3cea164dd9 100644 --- a/docs/zh_cn/quantization/w4a16.md +++ b/docs/zh_cn/quantization/w4a16.md @@ -131,3 +131,8 @@ lmdeploy serve api_client http://0.0.0.0:23333 | ---------------- | ------- | ------- | --------- | | Llama-2-7B-chat | 112.9 | 159.4 | 206.4 | | Llama-2-13B-chat | N/A | 90.7 | 115.8 | + +## 快速问答 + +1. 量化时出现 Out of Memory 显存不够:可以通过减小传参 `--calib-seqlen`,增大传参 `--calib-samples`,并使用 `--batch-size` 为 1。 +2. 
量化时,无法链接huggingface并下载数据集。可以尝试使用镜像,`export HF_ENDPOINT=https://hf-mirror.com`。 diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index d76d6a5f34..236e022b34 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -35,6 +35,7 @@ def add_parser_auto_awq(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + ArgumentHelper.dtype(parser) parser.add_argument( '--device', type=str, @@ -71,6 +72,7 @@ def add_parser_auto_gptq(): ArgumentHelper.calib_samples(parser) ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) + ArgumentHelper.dtype(parser) parser.add_argument('--w-bits', type=int, default=4, @@ -99,6 +101,7 @@ def add_parser_calibrate(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + ArgumentHelper.dtype(parser) @staticmethod def add_parser_smooth_quant(): @@ -122,6 +125,7 @@ def add_parser_smooth_quant(): ArgumentHelper.calib_seqlen(parser) ArgumentHelper.calib_batchsize(parser) ArgumentHelper.calib_search_scale(parser) + ArgumentHelper.dtype(parser) @staticmethod def auto_awq(args): diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 85784a58f5..cf7b6526ec 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -354,7 +354,7 @@ def calib_batchsize(parser): @staticmethod def calib_search_scale(parser): - """Add argument batch_size to parser.""" + """Add argument search_scale to parser.""" return parser.add_argument( '--search-scale', diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py index c41b28fd6e..2c84612839 100644 --- a/lmdeploy/lite/apis/auto_awq.py +++ b/lmdeploy/lite/apis/auto_awq.py @@ -2,6 +2,7 @@ import os import os.path as osp import shutil +from typing import Literal import torch from torch import nn @@ -12,9 +13,7 @@ from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.check_env import try_import_deeplink -from .calibrate import LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate - -NORM_TYPE_MAP = NORM_TYPE_MAP # legacy +from .calibrate import LAYER_TYPE_MAP, calibrate def save_vl_model(vl_model, model_path, dst_path): @@ -56,6 +55,7 @@ def auto_awq(model: str, search_scale: bool = False, device: str = 'cuda', revision: str = None, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', download_dir: str = None): """Perform weight quantization using AWQ algorithm. @@ -77,6 +77,7 @@ def auto_awq(model: str, revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. + dtype (str): Data type for loading model weights and calib infer. download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface. """ @@ -96,6 +97,7 @@ def auto_awq(model: str, w_bits=w_bits, w_group_size=w_group_size, search_scale=search_scale, + dtype=dtype, batch_size=batch_size) layer_type = LAYER_TYPE_MAP[type(model).__name__] diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index 4759e95b4f..307cf6d7e9 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from pathlib import Path -from typing import Union +from typing import Literal, Union import torch from torch import nn @@ -205,6 +205,7 @@ def calibrate(model: str, w_bits: int = 4, w_group_size: int = 128, search_scale: bool = False, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', batch_size: int = 1) -> None: """The main function for loading the model and performing calibration on a given dataset. @@ -225,6 +226,7 @@ def calibrate(model: str, w_group_size (int): Group size for weight quantization statistics. search_scale (bool): Whether search scale ratio. Default to False, which means only smooth quant with 0.5 ratio will be applied. + dtype (str): Data type for loading model weights and calib infer. batch_size (int): The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM. @@ -246,7 +248,7 @@ def calibrate(model: str, if model_type == 'llm': model = load_hf_from_pretrained(model, - torch_dtype=torch.float16, + dtype=dtype, trust_remote_code=True) vl_model = None elif model_type == 'vlm': @@ -257,7 +259,11 @@ def calibrate(model: str, if hasattr(vl_model, 'llm'): # MiniCPMV model = vl_model.llm model.config.use_cache = False - model.half().eval() + if dtype == 'float16': + model.half() + elif dtype == 'bfloat16': + model.to(torch.bfloat16) + model.eval() model_type = type(model).__name__ if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP: diff --git a/lmdeploy/lite/apis/gptq.py b/lmdeploy/lite/apis/gptq.py index 12b88a52cd..eb4418a533 100644 --- a/lmdeploy/lite/apis/gptq.py +++ b/lmdeploy/lite/apis/gptq.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import Literal import torch -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoTokenizer from lmdeploy.lite.utils.calib_dataloader import get_calib_loaders @@ -15,6 +16,7 @@ def auto_gptq(model: str, calib_samples: int = 128, calib_seqlen: int = 2048, batch_size: int = 1, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', revision: str = None): """Perform weight quantization using AWQ algorithm. @@ -29,9 +31,7 @@ def auto_gptq(model: str, calib_seqlen (int): The sequence length for calibration. w_bits (int): Bit number for weight quantization. w_group_size (int): Group size for weight quantization statistics. - search_scale (bool): Whether search scale ratio. Default to False, - which means only smooth quant with 0.5 ratio will be applied. - device (str): Device type of running. + dtype (str): Data type for loading model weights and calib infer. revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 
@@ -83,9 +83,18 @@ def auto_gptq(model: str, # load un-quantized model, by default, # the model will always be loaded into CPU memory + hf_config = AutoConfig.from_pretrained(pretrained_model_dir, + revision=revision, + trust_remote_code=True) + torch_dtype = getattr(hf_config, 'torch_dtype', torch.float16) + if dtype == 'float16': + torch_dtype = torch.float16 + elif dtype == 'bfloat16': + torch_dtype = torch.bfloat16 model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config, revision=revision, + torch_dtype=torch_dtype, trust_remote_code=True) # quantize model, the examples should be list of dict whose keys diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py index c8df67355e..188eedbd0e 100644 --- a/lmdeploy/lite/apis/smooth_quant.py +++ b/lmdeploy/lite/apis/smooth_quant.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. + +from typing import Literal + import fire import torch from torch import nn @@ -6,7 +9,8 @@ from lmdeploy.lite.apis.calibrate import (LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate) from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP, - awq_layers, smooth_layers) + awq_layers, skipped_module, + smooth_layers) from lmdeploy.lite.utils import collect_target_modules from lmdeploy.pytorch.models import QLinear, QRMSNorm @@ -19,8 +23,8 @@ def smooth_quant(model: str, search_scale: bool = False, batch_size: int = 1, w_bits: int = 8, + dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto', device: str = 'cuda'): - model_path = model vl_model, model, tokenizer, work_dir = calibrate(model, calib_dataset, @@ -31,6 +35,7 @@ def smooth_quant(model: str, w_bits=w_bits, w_group_size=-1, search_scale=search_scale, + dtype=dtype, batch_size=batch_size) # calibrate function exports the calibration statistics @@ -76,6 +81,8 @@ def smooth_quant(model: str, rmsnorms = collect_target_modules(model, norm_type) for name, linear in fcs.items(): + if skipped_module(name): + continue linear.to(device) q_linear = QLinear.from_float(linear) parent_name, _, child_name = name.rpartition('.') @@ -84,6 +91,8 @@ def smooth_quant(model: str, linear.to('cpu') for name, norm in rmsnorms.items(): + if skipped_module(name): + continue norm.to(device) q_norm = QRMSNorm.from_float(norm) parent_name, _, child_name = name.rpartition('.') diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index cf03a75216..3e24a13cc3 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -43,8 +43,10 @@ 'MixtralDecoderLayer': { 'input_layernorm': ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'], - 'post_attention_layernorm': - ['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3'] + 'post_attention_layernorm': [ + 'block_sparse_moe.gate', 'block_sparse_moe.experts.{i}.w1', + 'block_sparse_moe.experts.{i}.w3' + ] }, 'Qwen2VLDecoderLayer': { 'input_layernorm': @@ -120,7 +122,12 @@ def get_weight_scale(weight, q_group_size=-1): org_shape = weight.shape if q_group_size > 0: weight = weight.view(-1, q_group_size) - scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True) + abs_weight = weight.abs() + abs_weight_amax = abs_weight.amax(dim=1, keepdim=True) + if abs_weight_amax.min().item() == 0: + print('weight.amax.min is zero, clamping weight.amax to 1e-4') + abs_weight_amax = abs_weight_amax.clamp(min=1e-4) + scale = abs_weight / abs_weight_amax scale = scale.view(org_shape) scale = scale.mean(0) return scale @@ -153,8 +160,13 @@ def 
smooth_ln_fcs(ln: torch.nn.Module, concat_w = torch.cat([fc.weight for fc in fcs], dim=0) w_scales = get_weight_scale(concat_w, group_size) + w_scales_pow = w_scales.pow(1 - alpha) + if w_scales_pow.min().item() == 0: + print('w_scales.pow(1 - alpha).min is zero, ' + 'clamping w_scales.pow(1 - alpha) to 1e-4') + w_scales_pow = w_scales_pow.clamp(min=1e-4) scales = (act_scales.pow(alpha) / - w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype) + w_scales_pow).clamp(min=1e-4).to(device).to(dtype) scales = scales / (scales[nonzero_positions].max() * scales[nonzero_positions].min()).sqrt() @@ -204,8 +216,13 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module, concat_w = torch.cat([fc.weight for fc in fcs], dim=0) w_scales = get_weight_scale(concat_w, group_size) + w_scales_pow = w_scales.pow(1 - alpha) + if w_scales_pow.min().item() == 0: + print('w_scales.pow(1 - alpha).min is zero, ' + 'clamping w_scales.pow(1 - alpha) to 1e-4') + w_scales_pow = w_scales_pow.clamp(min=1e-4) scales = (act_scales.pow(alpha) / - w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype) + w_scales_pow).clamp(min=1e-4).to(device).to(dtype) scales = scales / (scales.max() * scales.min()).sqrt() # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py index e590f1a4eb..1df8f2c740 100644 --- a/lmdeploy/lite/quantization/calibration.py +++ b/lmdeploy/lite/quantization/calibration.py @@ -42,6 +42,9 @@ def __init__(self, tokenizer (PreTrainedTokenizer): Tokenizer of the given model. layer_type (Union[str, type]): Type of the layers to be observed. norm_type (Union[str, type]): Norm type used in the model. + batch_size (int): The batch size for running the calib samples. + Low GPU mem requires small batch_size. Large batch_size + reduces the calibration time while costing more VRAM. device (str, optional): Device where the model should run. Defaults to 'cuda'. """ @@ -290,9 +293,14 @@ def _search_module_scale(block, linears2scale: list, x, kwargs={}): org_sd = {k: v.cpu() for k, v in block.state_dict().items()} for ratio in range(0, n_grid): - ratio = ratio * 1 / n_grid - scales = (x_max.pow(ratio) / - w_mean.pow(1 - ratio)).clamp(min=1e-4).view(-1) + ratio = ratio / n_grid + w_mean_pow = w_mean.pow(1 - ratio) + if w_mean_pow.min().item() == 0: + print('w_mean.pow(1 - ratio).min is zero, ' + 'clamping w_mean.pow(1 - ratio) to 1e-4') + w_mean_pow = w_mean_pow.clamp(min=1e-4) + scales = (x_max.pow(ratio) / w_mean_pow).clamp(min=1e-4).view(-1) + scales = scales / (scales.max() * scales.min()).sqrt() for fc in linears2scale: fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) diff --git a/lmdeploy/lite/utils/load.py b/lmdeploy/lite/utils/load.py index bfd306a743..170c149778 100644 --- a/lmdeploy/lite/utils/load.py +++ b/lmdeploy/lite/utils/load.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Literal + import torch from transformers import AutoConfig, AutoModelForCausalLM @@ -7,8 +9,8 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, - dtype=torch.float16, - **kwargs): + dtype: Literal['float16', 'bfloat16', + 'auto'], **kwargs): if dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported(): raise RuntimeError('Your device does not supports bf16(bfloat16), ' @@ -21,10 +23,14 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) # HACK hard code for qwen, other configs do not have the `fp16` attribute.
- if dtype == torch.float16: - hf_config.fp16 = True - elif dtype == torch.bfloat16: - hf_config.bf16 = True + if hasattr(hf_config, 'fp16') or hasattr(hf_config, 'bf16'): + if dtype == 'bfloat16': + hf_config.bf16 = True + else: + hf_config.fp16 = True + + if dtype != 'auto': + setattr(hf_config, 'torch_dtype', dtype) with LoadNoInit(): # Load model
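Taken together, the changes resolve the effective torch dtype the same way in each entry point: an explicit 'float16' or 'bfloat16' always wins, 'auto' defers to the torch_dtype recorded in the checkpoint config (falling back to float16 when none is recorded, as in gptq.py above), and load_hf_from_pretrained guards against requesting bf16 on hardware without bf16 support. Below is a standalone sketch of that resolution rule; the helper name `resolve_torch_dtype` is illustrative and not part of the patch.

```python
import torch
from transformers import AutoConfig


def resolve_torch_dtype(model_path: str, dtype: str = 'auto') -> torch.dtype:
    """Illustrative sketch of the dtype handling introduced by this patch."""
    if dtype == 'bfloat16' and not torch.cuda.is_bf16_supported():
        raise RuntimeError('bf16 is not supported on this device, '
                           'please use float16 instead')
    if dtype == 'float16':
        return torch.float16
    if dtype == 'bfloat16':
        return torch.bfloat16
    # dtype == 'auto': defer to the checkpoint config, defaulting to float16
    # when the config does not record a torch_dtype.
    hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    return getattr(hf_config, 'torch_dtype', None) or torch.float16
```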