Commit

add xcomposer-vl quant
AllentDan committed May 13, 2024
1 parent 96365e0 commit 2b533f7
Showing 9 changed files with 41 additions and 22 deletions.
8 changes: 7 additions & 1 deletion lmdeploy/lite/apis/auto_awq.py
@@ -93,7 +93,13 @@ def auto_awq(model: str,
fcs.update(name2fc)

smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size, device)
quant_weights(model, fcs, w_bits, w_sym, w_group_size, device)
quant_weights(model,
fcs,
w_bits,
w_sym,
w_group_size,
device,
skip_if_contains='Plora') # TODO quant lora weight
quantization_config = dict(quant_method='awq',
version='gemm',
bits=w_bits,
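For reference, a minimal usage sketch of the quantization entry point with the new behaviour. It is not part of this diff; the model path is illustrative and any auto_awq arguments not shown are assumed to keep their defaults.

from lmdeploy.lite.apis.auto_awq import auto_awq

# 4-bit AWQ quantization; with this commit, linear layers whose child name
# contains 'Plora' are left unquantized.
auto_awq('internlm/internlm-xcomposer2-vl-7b',  # illustrative model path
         w_bits=4,
         w_group_size=128)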
23 changes: 17 additions & 6 deletions lmdeploy/lite/quantization/awq.py
@@ -207,21 +207,32 @@ def check_awq_supported(layer_type):
raise NotImplementedError


def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
def quant_weights(model,
fcs,
bits,
symmetry,
group_size=-1,
device='cuda',
skip_if_contains: str = None):
"""Quantize the weights of the target model's linear layers."""
from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
from lmdeploy.lite.quantization import WeightQuantizer
for name, fc in fcs.items():
fc.to(device)
quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
q_linear = WeightOnlyQLinear.from_linear(fc, quantizer)

parent_name, _, child_name = name.rpartition('.')
parent = model.get_submodule(parent_name)
fc.to('cpu')
pack_or_skip = 'packed'
if skip_if_contains and skip_if_contains in child_name:
q_linear = fc
pack_or_skip = 'skipped'
else:
quantizer = WeightQuantizer(bits, symmetry, 'per_group',
group_size)
q_linear = WeightOnlyQLinear.from_linear(fc, quantizer)
setattr(parent, child_name, q_linear)
fc.to('cpu')

print(f'{name} weight packed.')
print(f'{name} weight {pack_or_skip}.')


def smooth_layers(layers,
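The skip decision above only inspects the final attribute name of each linear layer. A standalone sketch of that check; the layer names below are illustrative, modelled on the 'Plora_A'/'Plora_B' adapter projections referenced later in this commit.

def would_skip(name: str, skip_if_contains: str = 'Plora') -> bool:
    """Mirror the new check in quant_weights: only the child name is tested."""
    _, _, child_name = name.rpartition('.')
    return bool(skip_if_contains) and skip_if_contains in child_name


print(would_skip('model.layers.0.attention.wqkv'))          # False -> packed
print(would_skip('model.layers.0.attention.wqkv.Plora_A'))  # True  -> skipped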
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/converter.py
@@ -289,6 +289,8 @@ def main(model_name: str,
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
if 'xcomposer2' in inferred_model_format:
output_format = 'plora-w4'
assert group_size > 0, f'group_size: {group_size} should > 0'
else:
output_format = update_output_format(model_name, inferred_model_format,
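Expressed as a self-contained sketch, the two added lines change the format selection as follows; 'xcomposer2-awq' is the registry name introduced in this commit, while 'llama-awq' stands in as an illustrative non-xcomposer2 AWQ format.

def pick_output_format(inferred_model_format: str) -> str:
    """Sketch of the updated awq branch in converter.main."""
    output_format = 'w4'
    if 'xcomposer2' in inferred_model_format:
        output_format = 'plora-w4'
    return output_format


assert pick_output_format('llama-awq') == 'w4'
assert pick_output_format('xcomposer2-awq') == 'plora-w4'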
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -11,3 +11,4 @@
from .qwen import QwenModel # noqa: F401
from .qwen_awq import QwenAwqModel # noqa: F401
from .xcomposer2 import Xcomposer2Model # noqa: F401
from .xcomposer2_awq import Xcomposer2AwqModel # noqa: F401
5 changes: 3 additions & 2 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py
@@ -2,8 +2,9 @@
import torch

from .base import INPUT_MODELS
from .xcomposer2 import Xcomposer2Model, Xcomposer2Reader
from .llama_awq import ensure_fp16orint32
from .xcomposer2 import Xcomposer2Model, Xcomposer2Reader


class Xcomposer2AwqReader(Xcomposer2Reader):
"""LlamaAwqReader."""
@@ -48,7 +49,6 @@ def attn_lora_b(self, i):
"""Get attn lora_b."""
return super()._attn(i, 'Plora_B.weight', 0, 0)


def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))
@@ -61,6 +61,7 @@ def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))


@INPUT_MODELS.register_module(name='xcomposer2-awq')
class Xcomposer2AwqModel(Xcomposer2Model):
"""Llama Awq model in hf format."""
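With the registration above in place, the converter can resolve the new reader through the registry. A hedged sketch mirroring the INPUT_MODELS.get(...) call in turbomind.py further down; the paths are placeholders.

import lmdeploy.turbomind.deploy.source_model  # noqa: F401, runs the register_module decorators
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS

model_cls = INPUT_MODELS.get('xcomposer2-awq')
input_model = model_cls(model_path='/path/to/xcomposer2-awq',  # placeholder
                        tokenizer_path='/path/to/xcomposer2-awq',
                        ckpt_path=None)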
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/target_model/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .fp import TurbomindModel # noqa: F401
from .plora import TurbomindPloraModel # noqa: F401
from .plora_w4 import TurbomindPloraW4Model # noqa: F401
from .w4 import TurbomindW4Model # noqa: F401
13 changes: 4 additions & 9 deletions lmdeploy/turbomind/deploy/target_model/plora_w4.py
@@ -1,13 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List

import torch

from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
from .base import OUTPUT_MODELS, TurbomindModelConfig, merge_qkv, permute
from .plora import TurbomindPloraModel, transpose_tensor
from .w4 import get_cuda_tensor, tp_m_s4, transpose_qk_s4, fuse_w1_w3_s4, convert_s4
from .w4 import (convert_s4, fuse_w1_w3_s4, get_cuda_tensor, tp_m_s4,
transpose_qk_s4)


@OUTPUT_MODELS.register_module(name=['plora-w4'])
class TurbomindPloraW4Model(TurbomindPloraModel):
@@ -71,7 +68,6 @@ def export_transformer_block(self, bin: BaseReader, i: int):
self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)


q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
if q_b is not None:
q_b = permute(q_b, size_per_head)
@@ -97,7 +93,6 @@ def export_transformer_block(self, bin: BaseReader, i: int):
self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)


# attn lora_a
lora_a_qkv, lora_a_o = bin.attn_lora_a(i)
lora_a_qkv, lora_a_o = transpose_tensor([lora_a_qkv, lora_a_o])
6 changes: 4 additions & 2 deletions lmdeploy/turbomind/turbomind.py
@@ -264,14 +264,16 @@ def _from_hf(self, model_source: ModelSource, model_path: str,
output_format = 'w4'
data_type = 'int4'
cfg.group_size = 128
if inferred_model_format == 'xcomposer2-awq':
output_format = 'plora-w4'
else:
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
if inferred_model_format == 'xcomposer2':
output_format = 'plora'
if inferred_model_format == 'xcomposer2':
output_format = 'plora'

input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path, tokenizer_path=model_path, ckpt_path=None)
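Condensed, the reordered branches now behave as sketched below. The awq test and the non-awq default are abbreviated; in the real method the default comes from update_output_format.

def select_output_format(inferred_model_format: str, default: str) -> str:
    """Sketch: AWQ xcomposer2 exports 'plora-w4', plain xcomposer2 'plora'."""
    if inferred_model_format.endswith('awq'):  # abbreviated awq check
        output_format = 'w4'
        if inferred_model_format == 'xcomposer2-awq':
            output_format = 'plora-w4'
    else:
        output_format = default  # update_output_format(...) in the real code
        if inferred_model_format == 'xcomposer2':
            output_format = 'plora'
    return output_format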
4 changes: 2 additions & 2 deletions src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
@@ -156,7 +156,7 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
}

if (weights.lora.r > 0) {
FT_CHECK(bit_size >= 16);
// FT_CHECK(bit_size >= 16);
deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r);
deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims);
}
@@ -203,7 +203,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
}

if (weights.lora.r) {
FT_CHECK(bit_size >= 16);
// FT_CHECK(bit_size >= 16);
auto n = prefix.rfind(".");
std::string _prefix = prefix.substr(0, n);
std::string _num = prefix.substr(n + 1);

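Taken together, the commit wires a 4-bit PLoRA path end to end. A hedged sketch of the assumed workflow; the model path, the work_dir argument and the engine options are illustrative and not prescribed by this commit.

from lmdeploy import TurbomindEngineConfig, pipeline
from lmdeploy.lite.apis.auto_awq import auto_awq

# 1. AWQ-quantize the HF checkpoint; the PLoRA adapter linears are kept in
#    fp16 by the change in awq.py above. The work_dir argument is assumed.
auto_awq('internlm/internlm-xcomposer2-vl-7b',
         work_dir='./xcomposer2-vl-7b-4bit',
         w_bits=4,
         w_group_size=128)

# 2. Load the quantized folder; turbomind should infer the 'xcomposer2-awq'
#    source format and export the new 'plora-w4' target format.
pipe = pipeline('./xcomposer2-vl-7b-4bit',
                backend_config=TurbomindEngineConfig(model_format='awq'))
print(pipe('Hi, who are you?'))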