Commit be0b1e3

Image2structure - new metrics (#2349)

JosselinSomervilleRoberts authored Feb 11, 2024
1 parent 637dfa9 commit be0b1e3
Showing 6 changed files with 151 additions and 145 deletions.
1 change: 1 addition & 0 deletions setup.cfg
@@ -158,6 +158,7 @@ image2structure =
     latex~=0.7.0
     pdf2image~=1.16.3
    opencv-python~=4.7.0.68
+    lpips~=0.1.4

 heim =
     # HEIM scenarios
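The new lpips pin lives under the image2structure extra, so running these metrics requires installing HELM with that extra enabled. A minimal sketch, assuming the crfm-helm package name on PyPI:

pip install "crfm-helm[image2structure]"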
101 changes: 85 additions & 16 deletions src/helm/benchmark/metrics/vision_language/image_metrics.py
@@ -1,37 +1,60 @@
-from typing import List, Dict
+from typing import List, Dict, Optional
 import numpy as np
+from torchvision import transforms
 from abc import ABC, abstractmethod

 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.common.images_utils import open_image
+from helm.common.gpu_utils import get_torch_device
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.media_object import MediaObject
 from helm.common.optional_dependencies import handle_module_not_found_error
 from ..metric_name import MetricName
 from ..statistic import Stat
-from .image_utils import preprocess_image, earth_movers_distance, pixel_similarity, sift_similarity
+from .image_utils import preprocess_image, earth_mover_similarity, pixel_similarity, sift_similarity

 try:
-    from PIL.Image import Image, open as open_image
+    from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
+    from PIL import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["images"])
+    handle_module_not_found_error(e, suggestions=["image2structure"])


 class CompilationError(Exception):
     pass


 class ImageMetric(EvaluateInstancesMetric, ABC):
+    """Abstract class for image metrics.
+    This class is designed to evaluate metrics on images that should be generated using the text
+    output of the model, such as LaTeX, HTML, etc.
+    The class provides a method to compile the completion into an image and then evaluate the
+    similarity between the generated image and the reference image using different metrics.
+    In addition to the metrics, the class also provides a metric to evaluate the compilation success.
+    If the compilation fails, the similarity metrics are not evaluated and are all set to the most
+    dissimilar value.
+    """

     COMPILE_METRIC: str = "compilation_success"
-    EARTH_MOVER_DISTANCE: str = "earth_mover_distance"
+    EARTH_MOVER_SIMILARITY: str = "earth_mover_similarity"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
+    LPIPS_SIMILARITY: str = "lpips_similarity"

-    def __init__(self, metric_names: List[str]):
+    def __init__(self, metric_names: List[str], normalize_by_white_score: bool = False):
         self._metric_names: List[str] = metric_names
+        self._lpips_metric: Optional[LearnedPerceptualImagePatchSimilarity] = None
+        self._device = get_torch_device()
+        self._normalize_by_white_score = normalize_by_white_score

     @abstractmethod
-    def compile_completion_into_image(self, request_state: RequestState, completion: str, ref_image: Image) -> Image:
+    def compile_completion_into_image(
+        self, request_state: RequestState, completion: str, ref_image: Image.Image
+    ) -> Image.Image:
         raise NotImplementedError

 def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
@@ -45,7 +68,7 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
             assert len(reference.output.multimedia_content.media_objects) > 0
             ref_media_object: MediaObject = reference.output.multimedia_content.media_objects[0]
             assert ref_media_object.type == "image"
-            ref_image: Image
+            ref_image: Image.Image
             rgb_ref_image: np.ndarray
             gray_ref_image: np.ndarray
             if ref_media_object.is_local_file and ref_media_object.location is not None:
@@ -57,6 +80,13 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
                    "Remote images are not supported in metrics. "
                    "Images should be downloaded when constructing the instance."
                )
+            white_image: Optional[Image.Image] = None
+            rgb_white_image: Optional[np.ndarray] = None
+            gray_white_image: Optional[np.ndarray] = None
+            if self._normalize_by_white_score:
+                white_image = Image.new("RGB", ref_image.size, (255, 255, 255))
+                rgb_white_image = np.array(white_image)
+                gray_white_image = preprocess_image(white_image)

             assert request_state.result is not None
             request_result: RequestResult = request_state.result
@@ -66,9 +96,9 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
            ]

            for completion in completions:
-                image: Image
+                image: Image.Image
                try:
-                    image = self.compile_completion_into_image(request_state, completion, ref_image)
+                    image = self.compile_completion_into_image(request_state, completion, ref_image).convert("RGB")
                except CompilationError:
                    stats_dict[self.COMPILE_METRIC].add(0)  # Did not compile
                    # For all other metrics, we set the value to zero
@@ -79,12 +109,51 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
                rgb_image: np.ndarray = np.array(image)
                gray_image: np.ndarray = preprocess_image(image)

-                if self.PIXEL_SIMILARITY in self._metric_names:
-                    stats_dict[self.PIXEL_SIMILARITY].add(pixel_similarity(gray_image, gray_ref_image))
-                if self.SIFT_SIMILARITY in self._metric_names:
-                    stats_dict[self.SIFT_SIMILARITY].add(sift_similarity(rgb_image, rgb_ref_image))
-                if self.EARTH_MOVER_DISTANCE in self._metric_names:
-                    stats_dict[self.EARTH_MOVER_DISTANCE].add(earth_movers_distance(gray_image, gray_ref_image))
+                metric_runs: list = [
+                    [self.PIXEL_SIMILARITY, pixel_similarity, gray_image, gray_ref_image, gray_white_image, True],
+                    [self.SIFT_SIMILARITY, sift_similarity, rgb_image, rgb_ref_image, rgb_white_image, False],
+                    [
+                        self.EARTH_MOVER_SIMILARITY,
+                        earth_mover_similarity,
+                        gray_image,
+                        gray_ref_image,
+                        gray_white_image,
+                        True,
+                    ],
+                    [self.LPIPS_SIMILARITY, self.lpips_similarity, image, ref_image, white_image, True],
+                ]
+
+                for metric_name, metric_fn, image1, image2, white_image, can_compute_on_white in metric_runs:
+                    value: float = metric_fn(image1, image2)
+                    if self._normalize_by_white_score and can_compute_on_white:
+                        assert white_image is not None
+                        value_white: float = metric_fn(image2, white_image)
+                        value = (value - value_white) / (1.0 - value_white)
+                    stats_dict[metric_name].add(value)

                stats_dict[self.COMPILE_METRIC].add(1)  # Compiled

        return list(stats_dict.values())
+
+    def lpips_similarity(self, generated_image: Image.Image, reference_image: Image.Image) -> float:
+        if self._lpips_metric is None:
+            self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg").to(self._device)
+
+        preprocessing = transforms.Compose(
+            [
+                transforms.Resize((256, 256)),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+            ]
+        )
+        generated_image_tensor = preprocessing(generated_image)
+        reference_image_tensor = preprocessing(reference_image)
+
+        # Add batch dimension (B, C, H, W) since torchmetrics expects batches
+        img1 = generated_image_tensor.unsqueeze(0).to(self._device)
+        img2 = reference_image_tensor.unsqueeze(0).to(self._device)
+
+        # Compute the LPIPS score
+        assert self._lpips_metric is not None
+        score: float = self._lpips_metric(img1, img2).detach().item()
+        return score
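Two ideas in this diff are worth isolating: the lazily-built LPIPS metric and the optional white-score normalization, which discounts how similar the reference already is to a blank white page. Below is a minimal, self-contained sketch of both; the helper names lpips_score and white_normalized and the image paths are ours for illustration, not part of the commit. Note that torchmetrics' LPIPS is a perceptual distance, so lower raw values mean more similar images even though the stat above is named lpips_similarity.

import torch
from PIL import Image
from torchvision import transforms
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity

# Same preprocessing as the commit: resize, then map pixel values to [-1, 1],
# the input range LearnedPerceptualImagePatchSimilarity expects by default.
preprocessing = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

lpips = LearnedPerceptualImagePatchSimilarity(net_type="vgg")


def lpips_score(img_a: Image.Image, img_b: Image.Image) -> float:
    # unsqueeze(0) adds the batch dimension (B, C, H, W) that torchmetrics expects.
    tensor_a = preprocessing(img_a.convert("RGB")).unsqueeze(0)
    tensor_b = preprocessing(img_b.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        return float(lpips(tensor_a, tensor_b).item())


def white_normalized(value: float, value_white: float) -> float:
    # The normalize_by_white_score rescaling from the diff: a score no better
    # than "reference vs. blank page" maps to <= 0; a perfect match stays at 1.
    return (value - value_white) / (1.0 - value_white)


generated = Image.open("generated.png")  # hypothetical paths
reference = Image.open("reference.png")
white = Image.new("RGB", reference.size, (255, 255, 255))

raw = lpips_score(generated, reference)
baseline = lpips_score(reference, white)
print(raw, white_normalized(raw, baseline))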
59 changes: 12 additions & 47 deletions src/helm/benchmark/metrics/vision_language/image_utils.py
@@ -1,4 +1,3 @@
-import warnings
 from scipy.stats import wasserstein_distance
 import numpy as np

@@ -10,31 +9,15 @@
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, suggestions=["image2structure"])

-##
-# Globals
-##
-
-warnings.filterwarnings("ignore")
-
-# specify resized image sizes
-height = 2**10
-width = 2**10
-
-##
-# Functions
-##


-def preprocess_image(image: Image, norm_exposure: bool = True) -> np.ndarray:
+def preprocess_image(image: Image) -> np.ndarray:
     """Preprocesses an image for use in metrics.
     Returns a grayscale image stored using int in a numpy array.
-    Also normalizes the exposure of the image.
     """
     image = image.convert("L")
     np_image = np.array(image)
     assert np_image.dtype == np.uint8
-    if norm_exposure:
-        np_image = normalize_exposure(np_image)
     return np_image

@@ -45,56 +28,38 @@ def get_histogram(img: np.ndarray) -> np.ndarray:
     the percent of the pixels in the image with the given darkness level.
     The histogram's values sum to 1.
     """
-    hist, _ = np.histogram(img, bins=256, range=(0, 255))
+    hist, _ = np.histogram(img, bins=256, range=(0, 256))
     hist = hist.astype(float) / img.size  # Normalize the histogram
     return hist


-def normalize_exposure(img: np.ndarray) -> np.ndarray:
+def earth_mover_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     """
-    Normalize the exposure of an image using numpy for efficiency.
-    """
-    img = img.astype(int)
-    hist, _ = np.histogram(img, bins=256, range=(0, 255))
-    hist = hist.astype(float) / img.size  # Normalize histogram
-
-    # Compute the CDF using numpy's cumsum function
-    cdf = np.cumsum(hist)
-    # Normalize the CDF
-    cdf_normalized = np.uint8(255 * cdf / cdf[-1])
-
-    # Use numpy's fancy indexing for normalization of the image
-    normalized = cdf_normalized[img]  # type: ignore
-
-    return normalized.astype(int)
-
-
-def earth_movers_distance(img_a: np.ndarray, img_b: np.ndarray) -> float:
-    """
-    Measure the Earth Mover's distance between two images
+    Measure the 1 - Earth Mover's distance between two images
     Args:
-        img_a (PIL.Image): the first image
-        img_b (PIL.Image): the second image
+        img_a (np.ndarray): the first image
+        img_b (np.ndarray): the second image
     Returns:
         float: the Earth Mover's distance between the images
     """
     hist_a = get_histogram(img_a)
     hist_b = get_histogram(img_b)
-    return wasserstein_distance(hist_a, hist_b)
+    return 1.0 - wasserstein_distance(hist_a, hist_b)


 def pixel_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     """
     Measure the pixel-level similarity between two images
     Args:
-        img_a (PIL.Image): the first image
-        img_b (PIL.Image): the second image
+        img_a (np.ndarray): the first image
+        img_b (np.ndarray): the second image
     Returns:
         float: the pixel-level similarity between the images
     """
-    return np.sum(np.absolute(img_a - img_b)) / (height * width) / 255
+    height, width = img_a.shape
+    return 1.0 - np.sum(np.abs(img_a - img_b)) / (height * width * 255)


 def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
@@ -118,7 +83,7 @@ def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     _, desc_b = orb.detectAndCompute(img_b, None)

     # Initialize the brute force matcher
-    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
+    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)

     # Match descriptors.
     matches = bf.match(desc_a, desc_b)
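For orientation, here is a hedged usage sketch of the reworked helpers (the image paths are hypothetical). After this commit every helper returns a similarity where 1.0 means identical; sift_similarity matches ORB descriptors despite its name; and since preprocess_image now returns uint8 arrays, it is safest to cast to int before pixel-wise subtraction, because uint8 arithmetic wraps modulo 256:

import numpy as np
from PIL import Image

from helm.benchmark.metrics.vision_language.image_utils import (
    preprocess_image,
    earth_mover_similarity,
    pixel_similarity,
    sift_similarity,
)

reference = Image.open("reference.png").convert("RGB")  # hypothetical paths
generated = Image.open("generated.png").convert("RGB")

# Grayscale uint8 arrays; cast to int so |a - b| cannot wrap around.
# Both images must share the same dimensions for pixel_similarity.
gray_ref = preprocess_image(reference).astype(int)
gray_gen = preprocess_image(generated).astype(int)

print(pixel_similarity(gray_gen, gray_ref))        # 1.0 for pixel-identical images
print(earth_mover_similarity(gray_gen, gray_ref))  # 1.0 for identical intensity histograms
print(sift_similarity(np.array(generated), np.array(reference)))  # share of good ORB matches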
60 changes: 0 additions & 60 deletions src/helm/benchmark/static/schema_classic.yaml
@@ -534,27 +534,6 @@ metrics:
     description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
     lower_is_better: false

-  # Vision Language metrics:
-  - name: earth_mover_distance
-    display_name: Earth Mover Distance
-    short_display_name: EMD
-    description: Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
-    lower_is_better: true
-  - name: pixel_similarity
-    display_name: Pixel Similarity
-    short_display_name: PS
-    description: Pixel Similarity between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: sift_similarity
-    display_name: SIFT Similarity
-    short_display_name: SIFT
-    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: compilation_success
-    display_name: Compilation Success
-    description: Fraction of instances where the generated code compiles successfully.
-    lower_is_better: false
-
   # CLEVA (Chinese) metrics:
   # Accuracy metrics (Chinese)
   - name: chinese_ibleu
@@ -868,19 +847,6 @@ metric_groups:
       - name: classification_micro_f1
         split: ${main_split}

-  # Vision-Language metrics
-  - name: image_generation
-    display_name: Image generation
-    metrics:
-      - name: earth_mover_distance
-        split: ${main_split}
-      - name: pixel_similarity
-        split: ${main_split}
-      - name: sift_similarity
-        split: ${main_split}
-      - name: compilation_success
-        split: ${main_split}
-
   - name: cleva_paraphrase_generation_metrics
     display_name: CLEVA (Chinese) paraphrase generation metrics
     metrics:
@@ -1223,14 +1189,6 @@ run_groups:
     category: Targeted evaluations
     visibility: this_group_only

-  ## Vision Language
-  - name: image2structure
-    display_name: Image2Structure
-    description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
-    category: Targeted evaluations
-    subgroups:
-      - image2latex
-
   ### Chinese
   - name: chinese_cleva
     display_name: CLEVA (Chinese) scenarios
@@ -2274,24 +2232,6 @@ run_groups:
       when: n/a
       language: synthetic

-  ## Vision-Language Scenarios
-  - name: image2latex
-    display_name: Image2LaTeX
-    description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms, and TikZ figures to LaTeX.
-    metric_groups:
-      - image_generation
-      - efficiency
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: image-to-text
-      what: mathematical equations, tables, algorithms, tikz
-      who: n/a
-      when: "2024"
-      language: English
-
   ## CLEVA (Chinese) Scenarios
   # Applications
   - name: cleva_closed_book_question_answering
(Diffs for the remaining 2 changed files were not loaded in this view.)