Commit

add xcomposer-vl quant
AllentDan committed May 13, 2024
1 parent 96365e0 commit 2b533f7
Showing 9 changed files with 41 additions and 22 deletions.
8 changes: 7 additions & 1 deletion lmdeploy/lite/apis/auto_awq.py
@@ -93,7 +93,13 @@ def auto_awq(model: str,
fcs.update(name2fc)

smooth_layers(layers, fc2fcs, norm2fcs, act_scales, w_group_size, device)
quant_weights(model, fcs, w_bits, w_sym, w_group_size, device)
quant_weights(model,
fcs,
w_bits,
w_sym,
w_group_size,
device,
skip_if_contains='Plora') # TODO quant lora weight
quantization_config = dict(quant_method='awq',
version='gemm',
bits=w_bits,
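For reference, a minimal usage sketch of the quantization entry point with the new behaviour. It is not part of this diff; the model path is illustrative and any auto_awq arguments not shown are assumed to keep their defaults.

from lmdeploy.lite.apis.auto_awq import auto_awq

# 4-bit AWQ quantization; with this commit, linear layers whose child name
# contains 'Plora' are left unquantized.
auto_awq('internlm/internlm-xcomposer2-vl-7b',  # illustrative model path
         w_bits=4,
         w_group_size=128)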
23 changes: 17 additions & 6 deletions lmdeploy/lite/quantization/awq.py
@@ -207,21 +207,32 @@ def check_awq_supported(layer_type):
raise NotImplementedError


def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
def quant_weights(model,
fcs,
bits,
symmetry,
group_size=-1,
device='cuda',
skip_if_contains: str = None):
"""Quantize the weights of the target model's linear layers."""
from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
from lmdeploy.lite.quantization import WeightQuantizer
for name, fc in fcs.items():
fc.to(device)
quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
q_linear = WeightOnlyQLinear.from_linear(fc, quantizer)

parent_name, _, child_name = name.rpartition('.')
parent = model.get_submodule(parent_name)
fc.to('cpu')
pack_or_skip = 'packed'
if skip_if_contains and skip_if_contains in child_name:
q_linear = fc
pack_or_skip = 'skipped'
else:
quantizer = WeightQuantizer(bits, symmetry, 'per_group',
group_size)
q_linear = WeightOnlyQLinear.from_linear(fc, quantizer)
setattr(parent, child_name, q_linear)
fc.to('cpu')

print(f'{name} weight packed.')
print(f'{name} weight {pack_or_skip}.')


def smooth_layers(layers,
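The skip decision above only inspects the final attribute name of each linear layer. A standalone sketch of that check; the layer names below are illustrative, modelled on the 'Plora_A'/'Plora_B' adapter projections referenced later in this commit.

def would_skip(name: str, skip_if_contains: str = 'Plora') -> bool:
    """Mirror the new check in quant_weights: only the child name is tested."""
    _, _, child_name = name.rpartition('.')
    return bool(skip_if_contains) and skip_if_contains in child_name


print(would_skip('model.layers.0.attention.wqkv'))          # False -> packed
print(would_skip('model.layers.0.attention.wqkv.Plora_A'))  # True  -> skipped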
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/converter.py
@@ -289,6 +289,8 @@ def main(model_name: str,
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
if 'xcomposer2' in inferred_model_format:
output_format = 'plora-w4'
assert group_size > 0, f'group_size: {group_size} should > 0'
else:
output_format = update_output_format(model_name, inferred_model_format,
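Expressed as a self-contained sketch, the two added lines change the format selection as follows; 'xcomposer2-awq' is the registry name introduced in this commit, while 'llama-awq' stands in as an illustrative non-xcomposer2 AWQ format.

def pick_output_format(inferred_model_format: str) -> str:
    """Sketch of the updated awq branch in converter.main."""
    output_format = 'w4'
    if 'xcomposer2' in inferred_model_format:
        output_format = 'plora-w4'
    return output_format


assert pick_output_format('llama-awq') == 'w4'
assert pick_output_format('xcomposer2-awq') == 'plora-w4'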
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -11,3 +11,4 @@
from .qwen import QwenModel # noqa: F401
from .qwen_awq import QwenAwqModel # noqa: F401
from .xcomposer2 import Xcomposer2Model # noqa: F401
from .xcomposer2_awq import Xcomposer2AwqModel # noqa: F401
5 changes: 3 additions & 2 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py
@@ -2,8 +2,9 @@
import torch

from .base import INPUT_MODELS
from .xcomposer2 import Xcomposer2Model, Xcomposer2Reader
from .llama_awq import ensure_fp16orint32
from .xcomposer2 import Xcomposer2Model, Xcomposer2Reader


class Xcomposer2AwqReader(Xcomposer2Reader):
"""LlamaAwqReader."""
@@ -48,7 +49,6 @@ def attn_lora_b(self, i):
"""Get attn lora_b."""
return super()._attn(i, 'Plora_B.weight', 0, 0)


def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))
@@ -61,6 +61,7 @@ def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))


@INPUT_MODELS.register_module(name='xcomposer2-awq')
class Xcomposer2AwqModel(Xcomposer2Model):
"""Llama Awq model in hf format."""
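With the registration above in place, the converter can resolve the new reader through the registry. A hedged sketch mirroring the INPUT_MODELS.get(...) call in turbomind.py further down; the paths are placeholders.

import lmdeploy.turbomind.deploy.source_model  # noqa: F401, runs the register_module decorators
from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS

model_cls = INPUT_MODELS.get('xcomposer2-awq')
input_model = model_cls(model_path='/path/to/xcomposer2-awq',  # placeholder
                        tokenizer_path='/path/to/xcomposer2-awq',
                        ckpt_path=None)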
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/target_model/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .fp import TurbomindModel # noqa: F401
from .plora import TurbomindPloraModel # noqa: F401
from .plora_w4 import TurbomindPloraW4Model # noqa: F401
from .w4 import TurbomindW4Model # noqa: F401
13 changes: 4 additions & 9 deletions lmdeploy/turbomind/deploy/target_model/plora_w4.py
@@ -1,13 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List

import torch

from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
from .base import OUTPUT_MODELS, TurbomindModelConfig, merge_qkv, permute
from .plora import TurbomindPloraModel, transpose_tensor
from .w4 import get_cuda_tensor, tp_m_s4, transpose_qk_s4, fuse_w1_w3_s4, convert_s4
from .w4 import (convert_s4, fuse_w1_w3_s4, get_cuda_tensor, tp_m_s4,
transpose_qk_s4)


@OUTPUT_MODELS.register_module(name=['plora-w4'])
class TurbomindPloraW4Model(TurbomindPloraModel):
@@ -71,7 +68,6 @@ def export_transformer_block(self, bin: BaseReader, i: int):
self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)


q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
if q_b is not None:
q_b = permute(q_b, size_per_head)
@@ -97,7 +93,6 @@ def export_transformer_block(self, bin: BaseReader, i: int):
self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)


# attn lora_a
lora_a_qkv, lora_a_o = bin.attn_lora_a(i)
lora_a_qkv, lora_a_o = transpose_tensor([lora_a_qkv, lora_a_o])
6 changes: 4 additions & 2 deletions lmdeploy/turbomind/turbomind.py
@@ -264,14 +264,16 @@ def _from_hf(self, model_source: ModelSource, model_path: str,
output_format = 'w4'
data_type = 'int4'
cfg.group_size = 128
if inferred_model_format == 'xcomposer2-awq':
output_format = 'plora-w4'
else:
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
if inferred_model_format == 'xcomposer2':
output_format = 'plora'
if inferred_model_format == 'xcomposer2':
output_format = 'plora'

input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path, tokenizer_path=model_path, ckpt_path=None)
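Condensed, the reordered branches now behave as sketched below. The awq test and the non-awq default are abbreviated; in the real method the default comes from update_output_format.

def select_output_format(inferred_model_format: str, default: str) -> str:
    """Sketch: AWQ xcomposer2 exports 'plora-w4', plain xcomposer2 'plora'."""
    if inferred_model_format.endswith('awq'):  # abbreviated awq check
        output_format = 'w4'
        if inferred_model_format == 'xcomposer2-awq':
            output_format = 'plora-w4'
    else:
        output_format = default  # update_output_format(...) in the real code
        if inferred_model_format == 'xcomposer2':
            output_format = 'plora'
    return output_format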
4 changes: 2 additions & 2 deletions src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
@@ -156,7 +156,7 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
}

if (weights.lora.r > 0) {
FT_CHECK(bit_size >= 16);
// FT_CHECK(bit_size >= 16);
deviceMalloc((T**)&weights.lora.a, weights.input_dims * weights.lora.r);
deviceMalloc((T**)&weights.lora.b, weights.lora.r * weights.output_dims);
}
@@ -203,7 +203,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
}

if (weights.lora.r) {
FT_CHECK(bit_size >= 16);
// FT_CHECK(bit_size >= 16);
auto n = prefix.rfind(".");
std::string _prefix = prefix.substr(0, n);
std::string _num = prefix.substr(n + 1);

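Taken together, the commit wires a 4-bit PLoRA path end to end. A hedged sketch of the assumed workflow; the model path, the work_dir argument and the engine options are illustrative and not prescribed by this commit.

from lmdeploy import TurbomindEngineConfig, pipeline
from lmdeploy.lite.apis.auto_awq import auto_awq

# 1. AWQ-quantize the HF checkpoint; the PLoRA adapter linears are kept in
#    fp16 by the change in awq.py above. The work_dir argument is assumed.
auto_awq('internlm/internlm-xcomposer2-vl-7b',
         work_dir='./xcomposer2-vl-7b-4bit',
         w_bits=4,
         w_group_size=128)

# 2. Load the quantized folder; turbomind should infer the 'xcomposer2-awq'
#    source format and export the new 'plora-w4' target format.
pipe = pipeline('./xcomposer2-vl-7b-4bit',
                backend_config=TurbomindEngineConfig(model_format='awq'))
print(pipe('Hi, who are you?'))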