Support torch_dtype modification and update FAQs for AWQ quantization (#2898)

* Support torch_dtype modification and update FAQs for AWQ quantization

* fix lint

* add clamp-zeros option

* add guidance

* datasets proxy

* remove clamp_zeros

* fix comments

* print

* fix ut hf-token

---------

Co-authored-by: RunningLeon <[email protected]>
AllentDan and RunningLeon authored Dec 25, 2024
1 parent 35a5591 commit 9565505
Showing 12 changed files with 97 additions and 28 deletions.
1 change: 0 additions & 1 deletion .github/workflows/unit-test.yml
@@ -91,7 +91,6 @@ jobs:
echo "TODO"
- name: Test lmdeploy python UT
run: |
huggingface-cli login --token ${{ secrets.HF_TOKEN }}
coverage run --branch --source lmdeploy -m pytest -rsE tests
coverage xml
coverage report -m
4 changes: 4 additions & 0 deletions docs/en/quantization/w4a16.md
@@ -128,3 +128,7 @@ We benchmarked the Llama-2-7B-chat and Llama-2-13B-chat models with 4-bit quanti
| ---------------- | ------- | ------- | --------- |
| Llama-2-7B-chat | 112.9 | 159.4 | 206.4 |
| Llama-2-13B-chat | N/A | 90.7 | 115.8 |

## FAQs

1. Out of Memory error during quantization due to insufficient GPU memory: this can be addressed by reducing `--calib-seqlen`, increasing `--calib-samples`, and setting `--batch-size` to 1.
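
   A minimal sketch of passing the same lower-memory settings through the Python API instead of the CLI. The keyword names are an assumption mirroring the CLI flags above (`--calib-seqlen`, `--calib-samples`, `--batch-size`), and the model id and output directory are placeholders:

   ```python
   # Hedged sketch: verify the keyword names against your lmdeploy version.
   from lmdeploy.lite.apis.auto_awq import auto_awq

   auto_awq(
       'meta-llama/Llama-2-7b-chat-hf',  # example model id
       work_dir='./llama2-7b-4bit',      # where the quantized weights are written
       calib_seqlen=1024,                # shorter calibration sequences -> less activation memory
       calib_samples=128,                # can be raised to keep calibration coverage
       batch_size=1,                     # one calibration sample per forward pass
   )
   ```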
5 changes: 5 additions & 0 deletions docs/zh_cn/quantization/w4a16.md
@@ -131,3 +131,8 @@ lmdeploy serve api_client http://0.0.0.0:23333
| ---------------- | ------- | ------- | --------- |
| Llama-2-7B-chat | 112.9 | 159.4 | 206.4 |
| Llama-2-13B-chat | N/A | 90.7 | 115.8 |

## FAQs

1. Out of Memory during quantization because GPU memory is insufficient: reduce `--calib-seqlen`, increase `--calib-samples`, and set `--batch-size` to 1.
2. Unable to connect to Hugging Face and download the calibration dataset during quantization: try a mirror, e.g. `export HF_ENDPOINT=https://hf-mirror.com`.
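
   If the mirror has to be configured from Python rather than the shell, a sketch (assumption: `HF_ENDPOINT` is typically read when `huggingface_hub` is imported, so set it before lmdeploy or `datasets` is imported; the model id is a placeholder):

   ```python
   import os

   # Point Hugging Face downloads at the mirror before any hub-aware library is imported.
   os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

   from lmdeploy.lite.apis.auto_awq import auto_awq  # noqa: E402  import after setting the env var

   auto_awq('internlm/internlm2_5-7b-chat', work_dir='./internlm2_5-7b-4bit')
   ```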
4 changes: 4 additions & 0 deletions lmdeploy/cli/lite.py
@@ -35,6 +35,7 @@ def add_parser_auto_awq():
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)
parser.add_argument(
'--device',
type=str,
@@ -71,6 +72,7 @@ def add_parser_auto_gptq():
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.dtype(parser)
parser.add_argument('--w-bits',
type=int,
default=4,
@@ -99,6 +101,7 @@ def add_parser_calibrate():
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)

@staticmethod
def add_parser_smooth_quant():
@@ -122,6 +125,7 @@ def add_parser_smooth_quant():
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.calib_batchsize(parser)
ArgumentHelper.calib_search_scale(parser)
ArgumentHelper.dtype(parser)

@staticmethod
def auto_awq(args):
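The new `ArgumentHelper.dtype(parser)` calls above register a `--dtype` flag on each sub-command. Its implementation is not part of this diff, so the following is only a guess at what it adds; the choices are taken from the `Literal['float16', 'bfloat16', 'auto']` annotations introduced in the APIs below, and the helper name here is hypothetical:

```python
import argparse

def add_dtype_argument(parser: argparse.ArgumentParser, default: str = 'auto'):
    """Hypothetical stand-in for ArgumentHelper.dtype(parser)."""
    return parser.add_argument(
        '--dtype',
        type=str,
        default=default,
        choices=['float16', 'bfloat16', 'auto'],
        help='data type for loading model weights and calib infer')

parser = argparse.ArgumentParser('lmdeploy lite auto_awq')
add_dtype_argument(parser)
print(parser.parse_args(['--dtype', 'bfloat16']).dtype)  # -> bfloat16
```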
2 changes: 1 addition & 1 deletion lmdeploy/cli/utils.py
@@ -354,7 +354,7 @@ def calib_batchsize(parser):

@staticmethod
def calib_search_scale(parser):
"""Add argument batch_size to parser."""
"""Add argument search_scale to parser."""

return parser.add_argument(
'--search-scale',
8 changes: 5 additions & 3 deletions lmdeploy/lite/apis/auto_awq.py
@@ -2,6 +2,7 @@
import os
import os.path as osp
import shutil
from typing import Literal

import torch
from torch import nn
@@ -12,9 +13,7 @@
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.pytorch.check_env import try_import_deeplink

from .calibrate import LAYER_TYPE_MAP, NORM_TYPE_MAP, calibrate

NORM_TYPE_MAP = NORM_TYPE_MAP # legacy
from .calibrate import LAYER_TYPE_MAP, calibrate


def save_vl_model(vl_model, model_path, dst_path):
@@ -56,6 +55,7 @@ def auto_awq(model: str,
search_scale: bool = False,
device: str = 'cuda',
revision: str = None,
dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto',
download_dir: str = None):
"""Perform weight quantization using AWQ algorithm.
@@ -77,6 +77,7 @@ def auto_awq(model: str,
revision (str): The specific model version to use. It can be a
branch name, a tag name, or a commit id. If unspecified,
will use the default version.
dtype (str): Data type for loading model weights and calib infer.
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
"""
@@ -96,6 +97,7 @@ def auto_awq(model: str,
w_bits=w_bits,
w_group_size=w_group_size,
search_scale=search_scale,
dtype=dtype,
batch_size=batch_size)

layer_type = LAYER_TYPE_MAP[type(model).__name__]
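A hedged usage sketch of the new `dtype` argument on `auto_awq`; the model id and `work_dir` are placeholders, and the remaining keyword arguments (truncated in this view) are left at their defaults:

```python
from lmdeploy.lite.apis.auto_awq import auto_awq

# 'auto' (the default) keeps the checkpoint's own torch_dtype;
# 'float16'/'bfloat16' force the cast used during calibration.
auto_awq('internlm/internlm2_5-7b-chat',
         work_dir='./internlm2_5-7b-4bit',
         dtype='bfloat16')
```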
12 changes: 9 additions & 3 deletions lmdeploy/lite/apis/calibrate.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.

from pathlib import Path
from typing import Union
from typing import Literal, Union

import torch
from torch import nn
@@ -205,6 +205,7 @@ def calibrate(model: str,
w_bits: int = 4,
w_group_size: int = 128,
search_scale: bool = False,
dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto',
batch_size: int = 1) -> None:
"""The main function for loading the model and performing calibration on a
given dataset.
@@ -225,6 +226,7 @@ def calibrate(model: str,
w_group_size (int): Group size for weight quantization statistics.
search_scale (bool): Whether search scale ratio. Default to False,
which means only smooth quant with 0.5 ratio will be applied.
dtype (str): Data type for loading model weights and calib infer.
batch_size (int): The batch size for running the calib samples.
Low GPU mem requires small batch_size. Large batch_size
reduces the calibration time while costs more VRAM.
@@ -246,7 +248,7 @@ def calibrate(model: str,

if model_type == 'llm':
model = load_hf_from_pretrained(model,
torch_dtype=torch.float16,
dtype=dtype,
trust_remote_code=True)
vl_model = None
elif model_type == 'vlm':
@@ -257,7 +259,11 @@ def calibrate(model: str,
if hasattr(vl_model, 'llm'): # MiniCPMV
model = vl_model.llm
model.config.use_cache = False
model.half().eval()
if dtype == 'float16':
model.half()
elif dtype == 'bfloat16':
model.to(torch.bfloat16)
model.eval()

model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
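A sketch of calling `calibrate` directly with the new `dtype` parameter. The 4-tuple return value matches the call site shown in `smooth_quant.py` further down; the calibration-data keyword names are an assumption mirroring the CLI flags, and the model id and `work_dir` are placeholders:

```python
from lmdeploy.lite.apis.calibrate import calibrate

vl_model, model, tokenizer, work_dir = calibrate(
    'internlm/internlm2_5-7b-chat',
    calib_dataset='ptb',      # assumed default calibration dataset
    calib_samples=128,
    calib_seqlen=2048,
    work_dir='./calib_out',
    dtype='auto',             # keep the checkpoint's torch_dtype
    batch_size=1)
```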
17 changes: 13 additions & 4 deletions lmdeploy/lite/apis/gptq.py
@@ -1,8 +1,9 @@
# Copyright (c) OpenMMLab. All rights reserved.
import logging
from typing import Literal

import torch
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from lmdeploy.lite.utils.calib_dataloader import get_calib_loaders

@@ -15,6 +16,7 @@ def auto_gptq(model: str,
calib_samples: int = 128,
calib_seqlen: int = 2048,
batch_size: int = 1,
dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto',
revision: str = None):
"""Perform weight quantization using AWQ algorithm.
@@ -29,9 +31,7 @@ def auto_gptq(model: str,
calib_seqlen (int): The sequence length for calibration.
w_bits (int): Bit number for weight quantization.
w_group_size (int): Group size for weight quantization statistics.
search_scale (bool): Whether search scale ratio. Default to False,
which means only smooth quant with 0.5 ratio will be applied.
device (str): Device type of running.
dtype (str): Data type for loading model weights and calib infer.
revision (str): The specific model version to use. It can be a
branch name, a tag name, or a commit id. If unspecified,
will use the default version.
@@ -83,9 +83,18 @@

# load un-quantized model, by default,
# the model will always be loaded into CPU memory
hf_config = AutoConfig.from_pretrained(pretrained_model_dir,
revision=revision,
trust_remote_code=True)
torch_dtype = getattr(hf_config, 'torch_dtype', torch.float16)
if dtype == 'float16':
torch_dtype = torch.float16
elif dtype == 'bfloat16':
torch_dtype = torch.bfloat16
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir,
quantize_config,
revision=revision,
torch_dtype=torch_dtype,
trust_remote_code=True)

# quantize model, the examples should be list of dict whose keys
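Restating the dtype resolution order used above as a standalone helper, for clarity only (not lmdeploy code): an explicit `--dtype` wins, otherwise the checkpoint's `torch_dtype` is used, falling back to `float16` when the config does not declare one.

```python
import torch
from transformers import AutoConfig

def resolve_torch_dtype(pretrained_model_dir: str, dtype: str = 'auto') -> torch.dtype:
    """Mirror of the resolution logic in the hunk above (illustrative only)."""
    hf_config = AutoConfig.from_pretrained(pretrained_model_dir, trust_remote_code=True)
    torch_dtype = getattr(hf_config, 'torch_dtype', torch.float16)
    if dtype == 'float16':
        torch_dtype = torch.float16
    elif dtype == 'bfloat16':
        torch_dtype = torch.bfloat16
    return torch_dtype
```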
13 changes: 11 additions & 2 deletions lmdeploy/lite/apis/smooth_quant.py
@@ -1,12 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.

from typing import Literal

import fire
import torch
from torch import nn

from lmdeploy.lite.apis.calibrate import (LAYER_TYPE_MAP, NORM_TYPE_MAP,
calibrate)
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
awq_layers, smooth_layers)
awq_layers, skipped_module,
smooth_layers)
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.pytorch.models import QLinear, QRMSNorm

@@ -19,8 +23,8 @@ def smooth_quant(model: str,
search_scale: bool = False,
batch_size: int = 1,
w_bits: int = 8,
dtype: Literal['float16', 'bfloat16', 'auto'] = 'auto',
device: str = 'cuda'):

model_path = model
vl_model, model, tokenizer, work_dir = calibrate(model,
calib_dataset,
@@ -31,6 +35,7 @@ def smooth_quant(model: str,
w_bits=w_bits,
w_group_size=-1,
search_scale=search_scale,
dtype=dtype,
batch_size=batch_size)

# calibrate function exports the calibration statistics
@@ -76,6 +81,8 @@ def smooth_quant(model: str,
rmsnorms = collect_target_modules(model, norm_type)

for name, linear in fcs.items():
if skipped_module(name):
continue
linear.to(device)
q_linear = QLinear.from_float(linear)
parent_name, _, child_name = name.rpartition('.')
@@ -84,6 +91,8 @@ def smooth_quant(model: str,
linear.to('cpu')

for name, norm in rmsnorms.items():
if skipped_module(name):
continue
norm.to(device)
q_norm = QRMSNorm.from_float(norm)
parent_name, _, child_name = name.rpartition('.')
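The module swap that follows the new `skipped_module` checks is folded in this view. Below is a self-contained toy (outside lmdeploy, with a stand-in replacement class and a hard-coded skip condition in place of `skipped_module`) of the `rpartition('.')` + `setattr` pattern it uses to graft a quantized module onto its parent:

```python
import torch
from torch import nn

class FakeQLinear(nn.Module):
    """Stand-in for QLinear.from_float, just to show the swap."""
    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.weight = nn.Parameter(linear.weight.detach().clone())

    def forward(self, x):
        return x @ self.weight.t()

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 4))
for name, module in list(model.named_modules()):
    if not isinstance(module, nn.Linear):
        continue
    if name == '0':                           # stand-in for skipped_module(name)
        continue
    parent_name, _, child_name = name.rpartition('.')
    parent = model.get_submodule(parent_name)  # '' resolves to the model itself
    setattr(parent, child_name, FakeQLinear(module))

print(model)  # the second Linear has been replaced by FakeQLinear
```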
27 changes: 22 additions & 5 deletions lmdeploy/lite/quantization/awq.py
@@ -43,8 +43,10 @@
'MixtralDecoderLayer': {
'input_layernorm':
['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
'post_attention_layernorm':
['block_sparse_moe.experts.{i}.w1', 'block_sparse_moe.experts.{i}.w3']
'post_attention_layernorm': [
'block_sparse_moe.gate', 'block_sparse_moe.experts.{i}.w1',
'block_sparse_moe.experts.{i}.w3'
]
},
'Qwen2VLDecoderLayer': {
'input_layernorm':
@@ -120,7 +122,12 @@ def get_weight_scale(weight, q_group_size=-1):
org_shape = weight.shape
if q_group_size > 0:
weight = weight.view(-1, q_group_size)
scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True)
abs_weight = weight.abs()
abs_weight_amax = abs_weight.amax(dim=1, keepdim=True)
if abs_weight_amax.min().item() == 0:
print('weight.amax.min is zero, clamping weight.amax to 1e-4')
abs_weight_amax = abs_weight_amax.clamp(min=1e-4)
scale = abs_weight / abs_weight_amax
scale = scale.view(org_shape)
scale = scale.mean(0)
return scale
@@ -153,8 +160,13 @@ def smooth_ln_fcs(ln: torch.nn.Module,
concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
w_scales = get_weight_scale(concat_w, group_size)

w_scales_pow = w_scales.pow(1 - alpha)
if w_scales_pow.min().item() == 0:
print('w_scales.pow(1 - alpha).min is zero, '
'clamping w_scales.pow(1 - alpha) to 1e-4')
w_scales_pow = w_scales_pow.clamp(min=1e-4)
scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
w_scales_pow).clamp(min=1e-4).to(device).to(dtype)

scales = scales / (scales[nonzero_positions].max() *
scales[nonzero_positions].min()).sqrt()
@@ -204,8 +216,13 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
w_scales = get_weight_scale(concat_w, group_size)

w_scales_pow = w_scales.pow(1 - alpha)
if w_scales_pow.min().item() == 0:
print('w_scales.pow(1 - alpha).min is zero, '
'clamping w_scales.pow(1 - alpha) to 1e-4')
w_scales_pow = w_scales_pow.clamp(min=1e-4)
scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).clamp(min=1e-4).to(device).to(dtype)
w_scales_pow).clamp(min=1e-4).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()

# (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
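A standalone demonstration (not lmdeploy code) of why the new clamps matter: an all-zero weight row drives `amax()` to zero, the un-clamped division then produces NaNs, and those NaNs would propagate into the smoothing scales.

```python
import torch

weight = torch.tensor([[0.0, 0.0, 0.0, 0.0],
                       [1.0, -2.0, 0.5, 4.0]])
abs_weight = weight.abs()
amax = abs_weight.amax(dim=1, keepdim=True)

print(abs_weight / amax)                  # first row is 0/0 -> nan
print(abs_weight / amax.clamp(min=1e-4))  # first row becomes 0, scales stay finite
```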
14 changes: 11 additions & 3 deletions lmdeploy/lite/quantization/calibration.py
@@ -42,6 +42,9 @@ def __init__(self,
tokenizer (PreTrainedTokenizer): Tokenizer of the given model.
layer_type (Union[str, type]): Type of the layers to be observed.
norm_type (Union[str, type]): Norm type used in the model.
batch_size (int): The batch size for running the calib samples.
Low GPU mem requires small batch_size. Large batch_size
reduces the calibration time while costs more VRAM.
device (str, optional): Device where the model should run.
Defaults to 'cuda'.
"""
@@ -290,9 +293,14 @@ def _search_module_scale(block, linears2scale: list, x, kwargs={}):

org_sd = {k: v.cpu() for k, v in block.state_dict().items()}
for ratio in range(0, n_grid):
ratio = ratio * 1 / n_grid
scales = (x_max.pow(ratio) /
w_mean.pow(1 - ratio)).clamp(min=1e-4).view(-1)
ratio = ratio / n_grid
w_mean_pow = w_mean.pow(1 - ratio)
if w_mean_pow.min().item() == 0:
print('w_mean.pow(1 - ratio).min is zero, '
'clamping w_mean.pow(1 - ratio) to 1e-4')
w_mean_pow = w_mean_pow.clamp(min=1e-4)
scales = (x_max.pow(ratio) / w_mean_pow).clamp(min=1e-4).view(-1)

scales = scales / (scales.max() * scales.min()).sqrt()
for fc in linears2scale:
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
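A self-contained restatement of the ratio grid search in `_search_module_scale`. The error evaluation here is a stand-in lambda; in the real code, whose loop body is folded in this view, each candidate scale is applied to the block and its output is compared against the original.

```python
import torch

def search_ratio(x_max, w_mean, evaluate, n_grid=20):
    best_ratio, best_err = -1.0, float('inf')
    for step in range(n_grid):
        ratio = step / n_grid
        w_mean_pow = w_mean.pow(1 - ratio).clamp(min=1e-4)   # same guard as above
        scales = (x_max.pow(ratio) / w_mean_pow).clamp(min=1e-4).view(-1)
        scales = scales / (scales.max() * scales.min()).sqrt()
        err = evaluate(scales)
        if err < best_err:
            best_ratio, best_err = ratio, err
    return best_ratio

x_max = torch.tensor([2.0, 0.5, 1.0])
w_mean = torch.tensor([0.1, 0.4, 0.2])
# Stand-in objective: prefer scales close to 1 (the real objective is block output error).
print(search_ratio(x_max, w_mean, lambda s: (s - 1).abs().mean().item()))
```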
18 changes: 12 additions & 6 deletions lmdeploy/lite/utils/load.py
@@ -1,14 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.

from typing import Literal

import torch
from transformers import AutoConfig, AutoModelForCausalLM

from lmdeploy.pytorch.accel import LoadNoInit


def load_hf_from_pretrained(pretrained_model_name_or_path,
dtype=torch.float16,
**kwargs):
dtype: Literal['float16', 'bfloat16',
'auto'], **kwargs):

if dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported():
raise RuntimeError('Your device does not supports bf16(bfloat16), '
@@ -21,10 +23,14 @@ def load_hf_from_pretrained(pretrained_model_name_or_path,
trust_remote_code=True)

# HACK hard code for qwen, other configs do not have the `fp16` attribute.
if dtype == torch.float16:
hf_config.fp16 = True
elif dtype == torch.bfloat16:
hf_config.bf16 = True
if hasattr(hf_config, 'fp16') or hasattr(hf_config, 'bf16'):
if dtype == 'bfloat16':
hf_config.bf16 = True
else:
hf_config.fp16 = True

if dtype != 'auto':
setattr(hf_config, 'torch_dtype', dtype)

with LoadNoInit():
# Load model
