Update H2OVL

Signed-off-by: DarkLight1337 <[email protected]>
vllm-project · Jan 30, 2025 · ddf55c2
1 parent a47f92a
commit ddf55c2
Show file tree

Hide file tree

Showing 7 changed files with 490 additions and 386 deletions.
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
@@ -650,7 +650,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
   *
   * ✅︎
-  *
+  * ✅︎
 - * `Idefics3ForConditionalGeneration`
   * Idefics3
   * T + I

diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -6,7 +6,7 @@
 from transformers import AutoConfig
 
 # Import the functions to test
-from vllm.model_executor.models.h2ovl import (calculate_targets,
+from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
                                               image_to_pixel_values_wrapper)
 from vllm.multimodal.image import rescale_image_size
 
@@ -27,16 +27,16 @@ def run_preprocessing_test(
         max_dynamic_patch = config.max_dynamic_patch
 
     width, height = image.size
-    use_MSAC = config.use_msac
+    use_msac = config.use_msac
 
     # Create the mapper function with the provided configuration
-    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
+    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_msac)
     pixel_values = mapper(image)
 
     # Calculate the expected number of blocks
-    if use_MSAC:
+    if use_msac:
         # First pass
-        blocks1, _, _, aspect_ratio = calculate_targets(
+        blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
             width,
             height,
             config.min_dynamic_patch,
@@ -47,7 +47,7 @@ def run_preprocessing_test(
         )
 
         # Second pass
-        blocks2, _, _, _ = calculate_targets(
+        blocks2, _, _, _ = calculate_h2ovl_targets(
             width,
             height,
             config.min_dynamic_patch,
@@ -68,7 +68,7 @@ def run_preprocessing_test(
         expected_blocks = total_blocks
 
     else:
-        blocks, _, _, _ = calculate_targets(
+        blocks, _, _, _ = calculate_h2ovl_targets(
             width,
             height,
             config.min_dynamic_patch,

diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -339,6 +339,7 @@ def __init__(self, hf_runner: HfRunner):
                                                      trust_remote_code=True)
             self.vision_config = self.config.vision_config
             self.use_thumbnail = self.config.use_thumbnail
+            self.use_msac = self.config.use_msac
             self.min_num = self.config.min_dynamic_patch
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
@@ -347,18 +348,19 @@ def __call__(self, text: str, images: Union[Image, List[Image]],
                      **kwargs):
             # yapf: disable
             from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
 
             # yapf: enable
             images = [images] if isinstance(images, Image) else images
             pixel_values = [
-                image_to_pixel_values(image,
-                                      self.image_size,
-                                      self.min_num,
-                                      self.max_num,
-                                      self.use_thumbnail,
-                                      use_MSAC=self.config.use_msac).to(
-                                          self.dtype) for image in images
+                image_to_pixel_values_h2ovl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                    use_msac=self.use_msac,
+                ).to(self.dtype) for image in images
             ]
             num_patches_list = [
                 pixel_value.shape[0] for pixel_value in pixel_values
@@ -406,13 +408,17 @@ def __init__(self, hf_runner: HfRunner):
         def __call__(self, text: str, images: Union[Image, List[Image]],
                      **kwargs):
             from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START,
+                image_to_pixel_values_internvl)
             images = [images] if isinstance(images, Image) else images
             pixel_values = [
-                image_to_pixel_values(image, self.image_size, self.min_num,
-                                      self.max_num,
-                                      self.use_thumbnail).to(self.dtype)
-                for image in images
+                image_to_pixel_values_internvl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                ).to(self.dtype) for image in images
             ]
             num_patches_list = [
                 pixel_value.shape[0] for pixel_value in pixel_values

diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
@@ -146,6 +146,7 @@ def _test_processing_correctness(
     "facebook/chameleon-7b",
     "deepseek-ai/deepseek-vl2-tiny",
     "adept/fuyu-8b",
+    "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",