add xcomposer-vl quant
AllentDan committed May 13, 2024
1 parent 68c5b35 commit 96365e0
Showing 2 changed files with 230 additions and 0 deletions.
78 changes: 78 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py
@@ -0,0 +1,78 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from .base import INPUT_MODELS
from .llama_awq import ensure_fp16orint32
from .xcomposer2 import Xcomposer2Model, Xcomposer2Reader


class Xcomposer2AwqReader(Xcomposer2Reader):
"""LlamaAwqReader."""

def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)

def _attn(self, i: int, kind: str):
"""Get q, k, v, o qweight for layer i."""
kv_head_num = self.model_cfg['kv_head_num']
gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}']
hidden_dim = qkv.shape[0]
qkv = qkv.view(hidden_dim, kv_head_num, gs + 2, -1)
q, k, v = torch.split(qkv, [gs, 1, 1], dim=-2)
q = q.reshape(hidden_dim, -1)
k = k.reshape(hidden_dim, -1)
v = v.reshape(hidden_dim, -1)
o = self.params[f'model.layers.{i}.attention.wo.{kind}']
return ensure_fp16orint32((q, k, v, o))

def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return self._attn(i, 'qweight')

def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return self._attn(i, 'qzeros')

def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return self._attn(i, 'scales')

def attn_lora_a(self, i):
"""Get attn lora_a."""
qkv = self.params[f'model.layers.{i}.attention.wqkv.Plora_A.weight']
o = self.params[f'model.layers.{i}.attention.wo.Plora_A.weight']
return qkv, o

def attn_lora_b(self, i):
"""Get attn lora_b."""
        return super()._attn(i, 'Plora_B.weight', 0, 0)

    def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))

def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))

def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
        return ensure_fp16orint32(self._ffn(i, 'scales'))


@INPUT_MODELS.register_module(name='xcomposer2-awq')
class Xcomposer2AwqModel(Xcomposer2Model):
"""Llama Awq model in hf format."""

Reader = Xcomposer2AwqReader

def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
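
A note on the reader above (not part of the commit): the snippet below replays the wqkv split from `_attn` on a dummy int32 tensor so the resulting shapes are easy to follow. The head counts and per-head width are made-up assumptions chosen only for illustration.

import torch

# Toy dimensions (assumptions for illustration only).
attn_head_num, kv_head_num, head_width, hidden_dim = 32, 8, 16, 128
gs = attn_head_num // kv_head_num  # query heads grouped per kv head

# Stand-in for model.layers.{i}.attention.wqkv.qweight.
qkv = torch.zeros(hidden_dim, kv_head_num * (gs + 2) * head_width,
                  dtype=torch.int32)

# Same reshape/split performed by Xcomposer2AwqReader._attn.
qkv = qkv.view(hidden_dim, kv_head_num, gs + 2, -1)
q, k, v = torch.split(qkv, [gs, 1, 1], dim=-2)
q = q.reshape(hidden_dim, -1)  # (128, 512): all 32 query heads
k = k.reshape(hidden_dim, -1)  # (128, 128): 8 kv heads
v = v.reshape(hidden_dim, -1)  # (128, 128)
print(q.shape, k.shape, v.shape)
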
152 changes: 152 additions & 0 deletions lmdeploy/turbomind/deploy/target_model/plora_w4.py
@@ -0,0 +1,152 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List

import torch

from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
from .plora import TurbomindPloraModel, transpose_tensor
from .w4 import (convert_s4, fuse_w1_w3_s4, get_cuda_tensor, tp_m_s4,
                 transpose_qk_s4)


@OUTPUT_MODELS.register_module(name=['plora-w4'])
class TurbomindPloraW4Model(TurbomindPloraModel):
"""Export to turbomind plora w4 format."""

def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__(input_model, cfg, to_file, out_dir)

def get_config(self, cfg: TurbomindModelConfig):
"""Get turbomind config."""
final_cfg = super().get_config(cfg).__dict__

# attn_bias, inter_size
visit = False
attn_bias = 0
for bin in self.input_model.bins():
for i in range(bin.start_layer_id, bin.end_layer_id):
visit = True
w1s, _, _ = bin.ffn_scale(i)
inter_size = w1s.shape[-1]
qb, _, _, _ = bin.attn_bias(i)
if qb is not None:
attn_bias = 1
break
if visit:
break
final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
return TurbomindModelConfig.from_dict(final_cfg)

def export_transformer_block(self, bin: BaseReader, i: int):
"""Export transformer layer i."""
assert bin.start_layer_id <= i < bin.end_layer_id
group_size = self.cfg.group_size
tp = self.cfg.tensor_para_size
size_per_head = self.cfg.size_per_head
# attn
q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i))
q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i))

q_qw = transpose_qk_s4(q_qw, group_size)
k_qw = transpose_qk_s4(k_qw, group_size)
q_qz = transpose_qk_s4(q_qz, group_size)
k_qz = transpose_qk_s4(k_qz, group_size)
q_s = permute(q_s, size_per_head)
k_s = permute(k_s, size_per_head)

qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)

qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
qkv_qw = tp_m_s4(qkv_qw, tp)
self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
        self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)

        # attn bias
        q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
if q_b is not None:
q_b = permute(q_b, size_per_head)
k_b = permute(k_b, size_per_head)
qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)

# ffn weights
w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i))

w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
w3_s)
w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
w13_qw = tp_m_s4(w13_qw, tp)
self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
-1)

w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
        self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)

        # attn lora_a
lora_a_qkv, lora_a_o = bin.attn_lora_a(i)
lora_a_qkv, lora_a_o = transpose_tensor([lora_a_qkv, lora_a_o])
        # qkv Plora_A is copied per TP rank; wo Plora_A is split along dim 0
self.save_split(lora_a_qkv,
f'layers.{i}.attention.w_qkv.lora_a.weight',
copy=True)
self.save_split(lora_a_o, f'layers.{i}.attention.wo.lora_a.weight', 0)
# attn lora_b
lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow = bin.attn_lora_b(i)
lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow = transpose_tensor(
[lora_b_qw, lora_b_kw, lora_b_vw, lora_b_ow])
lora_b_qw = permute(lora_b_qw, size_per_head)
lora_b_kw = permute(lora_b_kw, size_per_head)
lora_b_qkv_w = merge_qkv(lora_b_qw, lora_b_kw, lora_b_vw, tp, dim=2)
self.save_split(lora_b_qkv_w,
f'layers.{i}.attention.w_qkv.lora_b.weight', -1)
self.save_split(lora_b_ow,
f'layers.{i}.attention.wo.lora_b.weight',
copy=True)

        # ffn lora_a
lora_a_w1, lora_a_w2, lora_a_w3 = bin.ffn_lora_a(i)
lora_a_w1, lora_a_w2, lora_a_w3 = transpose_tensor(
[lora_a_w1, lora_a_w2, lora_a_w3])
        # w2 lora_a is split on dim 0; w1/w3 lora_a are copied per rank
self.save_split(lora_a_w2, f'layers.{i}.feed_forward.w2.lora_a.weight',
0)
self.save_split(lora_a_w1,
f'layers.{i}.feed_forward.w1.lora_a.weight',
copy=True)
self.save_split(lora_a_w3,
f'layers.{i}.feed_forward.w3.lora_a.weight',
copy=True)
        # ffn lora_b
lora_b_w1, lora_b_w2, lora_b_w3 = bin.ffn_lora_b(i)
lora_b_w1, lora_b_w2, lora_b_w3 = transpose_tensor(
[lora_b_w1, lora_b_w2, lora_b_w3])
        # w1/w3 lora_b are split on the last dim; w2 lora_b is copied per rank
self.save_split(lora_b_w1, f'layers.{i}.feed_forward.w1.lora_b.weight',
-1)
self.save_split(lora_b_w3, f'layers.{i}.feed_forward.w3.lora_b.weight',
-1)
self.save_split(lora_b_w2,
f'layers.{i}.feed_forward.w2.lora_b.weight',
copy=True)

# norm
attn_norm = bin.attn_norm(i)
ffn_norm = bin.ffn_norm(i)
self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
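
For readers unfamiliar with the export helpers: throughout export_transformer_block the third argument to save_split varies between -1, 0 and copy=True, read here as split along the last dim, split along dim 0, or replicate across tensor-parallel ranks. That reading is an assumption about the helper's semantics rather than something this diff shows, and the snippet below is only a rough stand-in written for this note, not lmdeploy's actual implementation.

import torch


def fake_save_split(x, name, split_dim=None, copy=False, tp=2):
    # Illustrative stand-in for BaseOutputModel.save_split (assumed behavior).
    if copy:
        shards = [x] * tp  # e.g. w_qkv.lora_a.weight above
    else:
        # e.g. split_dim=-1 for w_qkv.qweight, 0 for wo.qweight above
        shards = torch.chunk(x, tp, dim=split_dim)
    for rank, shard in enumerate(shards):
        print(f'{name} -> rank {rank}, shape {tuple(shard.shape)}')


fake_save_split(torch.zeros(128, 512), 'w_qkv.qweight', split_dim=-1)
fake_save_split(torch.zeros(512, 128), 'wo.qweight', split_dim=0)
fake_save_split(torch.zeros(8, 128), 'w_qkv.lora_a.weight', copy=True)
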
