[side-effect] bring back quantization of qwen2-vl, glm4v and etc. (#2954)

* bring back quantization of qwen2-vl, glm4v and etc.

* fix typo

* update log
lvhan028 authored Dec 26, 2024
1 parent d9b8372 commit 4e5cc16
Showing 8 changed files with 61 additions and 7 deletions.
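
The theme across the files below is the same: each affected vision-model class regains a `build_model()` that, when `with_llm=True` (the path taken during calibration/quantization), loads the full Hugging Face checkpoint onto CPU and keeps it as `self.vl_model`, and otherwise raises `NotImplementedError` because turbomind does not serve that model's vision part itself. A condensed, hedged sketch of that shared pattern (illustration only, not code from the commit; `ExampleVLModel` is a made-up stand-in):

# Hedged sketch (not code from the commit) of the build_model() pattern the
# files below restore; `ExampleVLModel` stands in for the real VisonModel
# subclasses.
class ExampleVLModel:

    def __init__(self, model_path: str, with_llm: bool = False):
        self.model_path = model_path
        self.with_llm = with_llm  # True on the calibration/quantization path
        self.vl_model = None

    def build_model(self):
        if self.with_llm:
            # quantization needs the complete HF model, built on CPU
            from transformers import AutoModelForCausalLM
            self.vl_model = AutoModelForCausalLM.from_pretrained(
                self.model_path, device_map='cpu', trust_remote_code=True)
        else:
            # the turbomind backend does not serve this model's vision part
            raise NotImplementedError('not supported by turbomind yet')
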
3 changes: 2 additions & 1 deletion lmdeploy/archs.py
@@ -128,7 +128,8 @@ def check_vl_llm(config: dict) -> bool:
         return True
     elif arch == 'MultiModalityCausalLM' and 'language_config' in config:
         return True
-    elif arch == 'ChatGLMModel' and 'vision_config' in config:
+    elif arch in ['ChatGLMModel', 'ChatGLMForConditionalGeneration'
+                  ] and 'vision_config' in config:
         return True
     elif arch in supported_archs:
         return True
4 changes: 2 additions & 2 deletions lmdeploy/lite/apis/calibrate.py
@@ -254,9 +254,9 @@ def calibrate(model: str,
     elif model_type == 'vlm':
         vl_model = load_vl_model(model, backend=None, with_llm=True).vl_model
         model = vl_model
-        if hasattr(vl_model, 'language_model'):  # deepseek vl
+        if hasattr(vl_model, 'language_model'):  # deepseek-vl, ...
             model = vl_model.language_model
-        if hasattr(vl_model, 'llm'):  # MiniCPMV
+        if hasattr(vl_model, 'llm'):  # MiniCPMV, ...
             model = vl_model.llm
         model.config.use_cache = False
         if dtype == 'float16':
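
For context, the hunk above sits on the path that `calibrate()` takes for vision-language models; a minimal, hedged sketch of that flow using only names visible in the diff (the model path is a hypothetical example):

# Hedged sketch of calibrate()'s VLM branch; mirrors the hunk above.
from lmdeploy.vl.model.builder import load_vl_model

model_path = 'THUDM/glm-4v-9b'  # hypothetical example path
vl_model = load_vl_model(model_path, backend=None, with_llm=True).vl_model
model = vl_model
if hasattr(vl_model, 'language_model'):  # deepseek-vl, ...
    model = vl_model.language_model
if hasattr(vl_model, 'llm'):  # MiniCPMV, ...
    model = vl_model.llm
model.config.use_cache = False  # the LLM part is what gets calibrated
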
4 changes: 2 additions & 2 deletions lmdeploy/vl/model/builder.py
@@ -73,8 +73,8 @@ def load_vl_model(model_path: str,
                 if backend == 'turbomind' or with_llm:
                     model.build_model()
                 return model
-        except Exception:
-            logger.error(f'matching vision model: {name} failed')
+        except Exception as e:
+            logger.error(f'build vision model {name} failed, {e}')
             raise

     raise ValueError(f'unsupported vl model with config {hf_config}')
8 changes: 8 additions & 0 deletions lmdeploy/vl/model/cogvlm.py
@@ -27,6 +27,14 @@ def build_preprocessor(self):
         patch_size = self.hf_config.vision_config['patch_size']
         self.n_token_per_image = 2 + (image_size // patch_size // 2)**2

+    def build_model(self):
+        if self.with_llm:
+            from transformers import AutoModelForCausalLM
+            self.vl_model = AutoModelForCausalLM.from_pretrained(
+                self.model_path, device_map='cpu', trust_remote_code=True)
+        else:
+            raise NotImplementedError('turbomind has not supported cogvlm yet')
+
     def preprocess(self, messages: List[Dict]) -> List[Dict]:
         """refer to the spec of `super().preprocess`"""
         images = self.collect_images(messages)
12 changes: 10 additions & 2 deletions lmdeploy/vl/model/glm_4v.py
@@ -13,13 +13,13 @@
 class GLM4VisionModel(VisonModel):
     """glm-4v-9b vision model."""

-    _arch = 'ChatGLMModel'
+    _arch = ['ChatGLMModel', 'ChatGLMForConditionalGeneration']

     @classmethod
     def match(cls, config: AutoConfig):
         """check whether the config match the model."""
         arch = config.architectures[0]
-        if arch == cls._arch and hasattr(config, 'vision_config'):
+        if arch in cls._arch and hasattr(config, 'vision_config'):
             return True
         return False

@@ -37,6 +37,14 @@ def build_preprocessor(self):
         patch_size = self.hf_config.vision_config['patch_size']
         self.n_token_per_image = 2 + (image_size // patch_size // 2)**2

+    def build_model(self):
+        if self.with_llm:
+            from transformers import AutoModelForCausalLM
+            self.vl_model = AutoModelForCausalLM.from_pretrained(
+                self.model_path, device_map='cpu', trust_remote_code=True)
+        else:
+            raise NotImplementedError('turbomind has not supported glm4v yet')
+
     def preprocess(self, messages: List[Dict]) -> List[Dict]:
         """refers to the spec of `super.preprocess()"""
         outputs = []
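
To see why `_arch` becomes a list: the two GLM-4V checkpoint flavors report different `architectures[0]` values (`ChatGLMModel` vs `ChatGLMForConditionalGeneration`), and `match()` now accepts either. A hedged illustration with a stubbed config object (the stub is made up; real callers pass a transformers `AutoConfig`):

# Hedged illustration of the widened architecture match; the config stub
# below is a made-up stand-in for a real transformers config.
from types import SimpleNamespace

from lmdeploy.vl.model.glm_4v import GLM4VisionModel

cfg = SimpleNamespace(architectures=['ChatGLMForConditionalGeneration'],
                      vision_config={})
print(GLM4VisionModel.match(cfg))  # True after this change
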
19 changes: 19 additions & 0 deletions lmdeploy/vl/model/mllama.py
@@ -5,6 +5,15 @@
 from lmdeploy.vl.model.base import VISION_MODELS, VisonModel


+def check_transformers():
+    try:
+        from transformers import MllamaForConditionalGeneration  # noqa: F401
+    except ImportError:
+        raise ImportError(
+            'please install latest transformers by '
+            'pip install git+https://github.com/huggingface/transformers.git')
+
+
 @VISION_MODELS.register_module()
 class MllamaVLModel(VisonModel):
     """llama3.2 model."""
@@ -31,6 +40,16 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         messages.append(dict(role='preprocess', content=outputs))
         return messages

+    def build_model(self):
+        check_transformers()
+        if self.with_llm:
+            from transformers import MllamaForConditionalGeneration
+            model = MllamaForConditionalGeneration.from_pretrained(
+                self.model_path, device_map='cpu')
+            self.vl_model = model
+        else:
+            raise NotImplementedError('turbomind has not supported mllama yet')
+
     @staticmethod
     def proc_messages(messages, chat_template, sequence_start):
         """apply chat template to get the prompt."""
8 changes: 8 additions & 0 deletions lmdeploy/vl/model/phi3_vision.py
@@ -21,6 +21,14 @@ def build_preprocessor(self):
         processor.tokenizer = None
         self.processor = processor

+    def build_model(self):
+        if self.with_llm:
+            from transformers import AutoModelForCausalLM
+            self.vl_model = AutoModelForCausalLM.from_pretrained(
+                self.model_path, device_map='cpu', trust_remote_code=True)
+        else:
+            raise NotImplementedError('turbomind has not supported phi3v yet')
+
     def preprocess(self, messages: List[Dict]) -> List[Dict]:
         """refers to `super.preprocess() for spec."""
         images = self.collect_images(messages)
10 changes: 10 additions & 0 deletions lmdeploy/vl/model/qwen2.py
@@ -64,6 +64,16 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         messages.append(dict(role='preprocess', content=outputs))
         return messages

+    def build_model(self):
+        check_qwen_vl_deps_install()
+        from transformers import Qwen2VLForConditionalGeneration
+        if self.with_llm:
+            self.vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                self.model_path, device_map='cpu')
+        else:
+            raise NotImplementedError(
+                'turbomind has not supported qwen2-vl yet')
+
     @torch.no_grad()
     def forward(self,
                 messages: List[Dict],
