Commit be0b1e3

Image2structure - new metrics (#2349)

JosselinSomervilleRoberts authored Feb 11, 2024
1 parent 637dfa9 commit be0b1e3
Showing 6 changed files with 151 additions and 145 deletions.
1 change: 1 addition & 0 deletions setup.cfg
@@ -158,6 +158,7 @@ image2structure =
     latex~=0.7.0
     pdf2image~=1.16.3
    opencv-python~=4.7.0.68
+    lpips~=0.1.4

 heim =
     # HEIM scenarios
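The new lpips pin lives under the image2structure extra, so running these metrics requires installing HELM with that extra enabled. A minimal sketch, assuming the crfm-helm package name on PyPI:

pip install "crfm-helm[image2structure]"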
101 changes: 85 additions & 16 deletions src/helm/benchmark/metrics/vision_language/image_metrics.py
@@ -1,37 +1,60 @@
-from typing import List, Dict
+from typing import List, Dict, Optional
 import numpy as np
+from torchvision import transforms
 from abc import ABC, abstractmethod

 from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
+from helm.common.images_utils import open_image
+from helm.common.gpu_utils import get_torch_device
 from helm.common.request import RequestResult
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.media_object import MediaObject
 from helm.common.optional_dependencies import handle_module_not_found_error
 from ..metric_name import MetricName
 from ..statistic import Stat
-from .image_utils import preprocess_image, earth_movers_distance, pixel_similarity, sift_similarity
+from .image_utils import preprocess_image, earth_mover_similarity, pixel_similarity, sift_similarity

 try:
-    from PIL.Image import Image, open as open_image
+    from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
+    from PIL import Image
 except ModuleNotFoundError as e:
-    handle_module_not_found_error(e, suggestions=["images"])
+    handle_module_not_found_error(e, suggestions=["image2structure"])


 class CompilationError(Exception):
     pass


 class ImageMetric(EvaluateInstancesMetric, ABC):
+    """Abstract class for image metrics.
+    This class is designed to evaluate metrics on images that should be generated using the text
+    output of the model, such as LaTeX, HTML, etc.
+    The class provides a method to compile the completion into an image and then evaluate the
+    similarity between the generated image and the reference image using different metrics.
+    In addition to the metrics, the class also provides a metric to evaluate the compilation success.
+    If the compilation fails, the similarity metrics are not evaluated and are all set to the most
+    dissimilar value.
+    """

     COMPILE_METRIC: str = "compilation_success"
-    EARTH_MOVER_DISTANCE: str = "earth_mover_distance"
+    EARTH_MOVER_SIMILARITY: str = "earth_mover_similarity"
     PIXEL_SIMILARITY: str = "pixel_similarity"
     SIFT_SIMILARITY: str = "sift_similarity"
+    LPIPS_SIMILARITY: str = "lpips_similarity"

-    def __init__(self, metric_names: List[str]):
+    def __init__(self, metric_names: List[str], normalize_by_white_score: bool = False):
         self._metric_names: List[str] = metric_names
+        self._lpips_metric: Optional[LearnedPerceptualImagePatchSimilarity] = None
+        self._device = get_torch_device()
+        self._normalize_by_white_score = normalize_by_white_score

     @abstractmethod
-    def compile_completion_into_image(self, request_state: RequestState, completion: str, ref_image: Image) -> Image:
+    def compile_completion_into_image(
+        self, request_state: RequestState, completion: str, ref_image: Image.Image
+    ) -> Image.Image:
         raise NotImplementedError

 def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
@@ -45,7 +68,7 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
             assert len(reference.output.multimedia_content.media_objects) > 0
             ref_media_object: MediaObject = reference.output.multimedia_content.media_objects[0]
             assert ref_media_object.type == "image"
-            ref_image: Image
+            ref_image: Image.Image
             rgb_ref_image: np.ndarray
             gray_ref_image: np.ndarray
             if ref_media_object.is_local_file and ref_media_object.location is not None:
@@ -57,6 +80,13 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
                    "Remote images are not supported in metrics. "
                    "Images should be downloaded when constructing the instance."
                )
+            white_image: Optional[Image.Image] = None
+            rgb_white_image: Optional[np.ndarray] = None
+            gray_white_image: Optional[np.ndarray] = None
+            if self._normalize_by_white_score:
+                white_image = Image.new("RGB", ref_image.size, (255, 255, 255))
+                rgb_white_image = np.array(white_image)
+                gray_white_image = preprocess_image(white_image)

             assert request_state.result is not None
             request_result: RequestResult = request_state.result
@@ -66,9 +96,9 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
            ]

            for completion in completions:
-                image: Image
+                image: Image.Image
                try:
-                    image = self.compile_completion_into_image(request_state, completion, ref_image)
+                    image = self.compile_completion_into_image(request_state, completion, ref_image).convert("RGB")
                except CompilationError:
                    stats_dict[self.COMPILE_METRIC].add(0)  # Did not compile
                    # For all other metrics, we set the value to zero
@@ -79,12 +109,51 @@ def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
                rgb_image: np.ndarray = np.array(image)
                gray_image: np.ndarray = preprocess_image(image)

-                if self.PIXEL_SIMILARITY in self._metric_names:
-                    stats_dict[self.PIXEL_SIMILARITY].add(pixel_similarity(gray_image, gray_ref_image))
-                if self.SIFT_SIMILARITY in self._metric_names:
-                    stats_dict[self.SIFT_SIMILARITY].add(sift_similarity(rgb_image, rgb_ref_image))
-                if self.EARTH_MOVER_DISTANCE in self._metric_names:
-                    stats_dict[self.EARTH_MOVER_DISTANCE].add(earth_movers_distance(gray_image, gray_ref_image))
+                metric_runs: list = [
+                    [self.PIXEL_SIMILARITY, pixel_similarity, gray_image, gray_ref_image, gray_white_image, True],
+                    [self.SIFT_SIMILARITY, sift_similarity, rgb_image, rgb_ref_image, rgb_white_image, False],
+                    [
+                        self.EARTH_MOVER_SIMILARITY,
+                        earth_mover_similarity,
+                        gray_image,
+                        gray_ref_image,
+                        gray_white_image,
+                        True,
+                    ],
+                    [self.LPIPS_SIMILARITY, self.lpips_similarity, image, ref_image, white_image, True],
+                ]
+
+                for metric_name, metric_fn, image1, image2, white_image, can_compute_on_white in metric_runs:
+                    value: float = metric_fn(image1, image2)
+                    if self._normalize_by_white_score and can_compute_on_white:
+                        assert white_image is not None
+                        value_white: float = metric_fn(image2, white_image)
+                        value = (value - value_white) / (1.0 - value_white)
+                    stats_dict[metric_name].add(value)

                stats_dict[self.COMPILE_METRIC].add(1)  # Compiled

        return list(stats_dict.values())
+
+    def lpips_similarity(self, generated_image: Image.Image, reference_image: Image.Image) -> float:
+        if self._lpips_metric is None:
+            self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg").to(self._device)
+
+        preprocessing = transforms.Compose(
+            [
+                transforms.Resize((256, 256)),
+                transforms.ToTensor(),
+                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
+            ]
+        )
+        generated_image_tensor = preprocessing(generated_image)
+        reference_image_tensor = preprocessing(reference_image)
+
+        # Add batch dimension (B, C, H, W) since torchmetrics expects batches
+        img1 = generated_image_tensor.unsqueeze(0).to(self._device)
+        img2 = reference_image_tensor.unsqueeze(0).to(self._device)
+
+        # Compute the LPIPS score
+        assert self._lpips_metric is not None
+        score: float = self._lpips_metric(img1, img2).detach().item()
+        return score
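Two ideas in this diff are worth isolating: the lazily-built LPIPS metric and the optional white-score normalization, which discounts how similar the reference already is to a blank white page. Below is a minimal, self-contained sketch of both; the helper names lpips_score and white_normalized and the image paths are ours for illustration, not part of the commit. Note that torchmetrics' LPIPS is a perceptual distance, so lower raw values mean more similar images even though the stat above is named lpips_similarity.

import torch
from PIL import Image
from torchvision import transforms
from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity

# Same preprocessing as the commit: resize, then map pixel values to [-1, 1],
# the input range LearnedPerceptualImagePatchSimilarity expects by default.
preprocessing = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

lpips = LearnedPerceptualImagePatchSimilarity(net_type="vgg")


def lpips_score(img_a: Image.Image, img_b: Image.Image) -> float:
    # unsqueeze(0) adds the batch dimension (B, C, H, W) that torchmetrics expects.
    tensor_a = preprocessing(img_a.convert("RGB")).unsqueeze(0)
    tensor_b = preprocessing(img_b.convert("RGB")).unsqueeze(0)
    with torch.no_grad():
        return float(lpips(tensor_a, tensor_b).item())


def white_normalized(value: float, value_white: float) -> float:
    # The normalize_by_white_score rescaling from the diff: a score no better
    # than "reference vs. blank page" maps to <= 0; a perfect match stays at 1.
    return (value - value_white) / (1.0 - value_white)


generated = Image.open("generated.png")  # hypothetical paths
reference = Image.open("reference.png")
white = Image.new("RGB", reference.size, (255, 255, 255))

raw = lpips_score(generated, reference)
baseline = lpips_score(reference, white)
print(raw, white_normalized(raw, baseline))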
59 changes: 12 additions & 47 deletions src/helm/benchmark/metrics/vision_language/image_utils.py
@@ -1,4 +1,3 @@
-import warnings
 from scipy.stats import wasserstein_distance
 import numpy as np

@@ -10,31 +9,15 @@
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, suggestions=["image2structure"])

-##
-# Globals
-##
-
-warnings.filterwarnings("ignore")
-
-# specify resized image sizes
-height = 2**10
-width = 2**10
-
-##
-# Functions
-##


-def preprocess_image(image: Image, norm_exposure: bool = True) -> np.ndarray:
+def preprocess_image(image: Image) -> np.ndarray:
     """Preprocesses an image for use in metrics.
     Returns a grayscale image stored using int in a numpy array.
-    Also normalizes the exposure of the image.
     """
     image = image.convert("L")
     np_image = np.array(image)
     assert np_image.dtype == np.uint8
-    if norm_exposure:
-        np_image = normalize_exposure(np_image)
     return np_image

@@ -45,56 +28,38 @@ def get_histogram(img: np.ndarray) -> np.ndarray:
     the percent of the pixels in the image with the given darkness level.
     The histogram's values sum to 1.
     """
-    hist, _ = np.histogram(img, bins=256, range=(0, 255))
+    hist, _ = np.histogram(img, bins=256, range=(0, 256))
     hist = hist.astype(float) / img.size  # Normalize the histogram
     return hist


-def normalize_exposure(img: np.ndarray) -> np.ndarray:
+def earth_mover_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     """
-    Normalize the exposure of an image using numpy for efficiency.
-    """
-    img = img.astype(int)
-    hist, _ = np.histogram(img, bins=256, range=(0, 255))
-    hist = hist.astype(float) / img.size  # Normalize histogram
-
-    # Compute the CDF using numpy's cumsum function
-    cdf = np.cumsum(hist)
-    # Normalize the CDF
-    cdf_normalized = np.uint8(255 * cdf / cdf[-1])
-
-    # Use numpy's fancy indexing for normalization of the image
-    normalized = cdf_normalized[img]  # type: ignore
-
-    return normalized.astype(int)
-
-
-def earth_movers_distance(img_a: np.ndarray, img_b: np.ndarray) -> float:
-    """
-    Measure the Earth Mover's distance between two images
+    Measure the 1 - Earth Mover's distance between two images
     Args:
-        img_a (PIL.Image): the first image
-        img_b (PIL.Image): the second image
+        img_a (np.ndarray): the first image
+        img_b (np.ndarray): the second image
     Returns:
         float: the Earth Mover's distance between the images
     """
     hist_a = get_histogram(img_a)
     hist_b = get_histogram(img_b)
-    return wasserstein_distance(hist_a, hist_b)
+    return 1.0 - wasserstein_distance(hist_a, hist_b)


 def pixel_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     """
     Measure the pixel-level similarity between two images
     Args:
-        img_a (PIL.Image): the first image
-        img_b (PIL.Image): the second image
+        img_a (np.ndarray): the first image
+        img_b (np.ndarray): the second image
     Returns:
         float: the pixel-level similarity between the images
     """
-    return np.sum(np.absolute(img_a - img_b)) / (height * width) / 255
+    height, width = img_a.shape
+    return 1.0 - np.sum(np.abs(img_a - img_b)) / (height * width * 255)


 def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
@@ -118,7 +83,7 @@ def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     _, desc_b = orb.detectAndCompute(img_b, None)

     # Initialize the brute force matcher
-    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
+    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)

     # Match descriptors.
     matches = bf.match(desc_a, desc_b)
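For orientation, here is a hedged usage sketch of the reworked helpers (the image paths are hypothetical). After this commit every helper returns a similarity where 1.0 means identical; sift_similarity matches ORB descriptors despite its name; and since preprocess_image now returns uint8 arrays, it is safest to cast to int before pixel-wise subtraction, because uint8 arithmetic wraps modulo 256:

import numpy as np
from PIL import Image

from helm.benchmark.metrics.vision_language.image_utils import (
    preprocess_image,
    earth_mover_similarity,
    pixel_similarity,
    sift_similarity,
)

reference = Image.open("reference.png").convert("RGB")  # hypothetical paths
generated = Image.open("generated.png").convert("RGB")

# Grayscale uint8 arrays; cast to int so |a - b| cannot wrap around.
# Both images must share the same dimensions for pixel_similarity.
gray_ref = preprocess_image(reference).astype(int)
gray_gen = preprocess_image(generated).astype(int)

print(pixel_similarity(gray_gen, gray_ref))        # 1.0 for pixel-identical images
print(earth_mover_similarity(gray_gen, gray_ref))  # 1.0 for identical intensity histograms
print(sift_similarity(np.array(generated), np.array(reference)))  # share of good ORB matches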
60 changes: 0 additions & 60 deletions src/helm/benchmark/static/schema_classic.yaml
@@ -534,27 +534,6 @@ metrics:
     description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
     lower_is_better: false

-  # Vision Language metrics:
-  - name: earth_mover_distance
-    display_name: Earth Mover Distance
-    short_display_name: EMD
-    description: Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
-    lower_is_better: true
-  - name: pixel_similarity
-    display_name: Pixel Similarity
-    short_display_name: PS
-    description: Pixel Similarity between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: sift_similarity
-    display_name: SIFT Similarity
-    short_display_name: SIFT
-    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
-    lower_is_better: false
-  - name: compilation_success
-    display_name: Compilation Success
-    description: Fraction of instances where the generated code compiles successfully.
-    lower_is_better: false
-
   # CLEVA (Chinese) metrics:
   # Accuracy metrics (Chinese)
   - name: chinese_ibleu
@@ -868,19 +847,6 @@ metric_groups:
       - name: classification_micro_f1
         split: ${main_split}

-  # Vision-Language metrics
-  - name: image_generation
-    display_name: Image generation
-    metrics:
-      - name: earth_mover_distance
-        split: ${main_split}
-      - name: pixel_similarity
-        split: ${main_split}
-      - name: sift_similarity
-        split: ${main_split}
-      - name: compilation_success
-        split: ${main_split}
-
   - name: cleva_paraphrase_generation_metrics
     display_name: CLEVA (Chinese) paraphrase generation metrics
     metrics:
@@ -1223,14 +1189,6 @@ run_groups:
     category: Targeted evaluations
     visibility: this_group_only

-  ## Vision Language
-  - name: image2structure
-    display_name: Image2Structure
-    description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
-    category: Targeted evaluations
-    subgroups:
-      - image2latex
-
   ### Chinese
   - name: chinese_cleva
     display_name: CLEVA (Chinese) scenarios
@@ -2274,24 +2232,6 @@ run_groups:
       when: n/a
       language: synthetic

-  ## Vision-Language Scenarios
-  - name: image2latex
-    display_name: Image2LaTeX
-    description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms, and TikZ figures to LaTeX.
-    metric_groups:
-      - image_generation
-      - efficiency
-      - general_information
-    environment:
-      main_name: exact_match
-      main_split: test
-    taxonomy:
-      task: image-to-text
-      what: mathematical equations, tables, algorithms, tikz
-      who: n/a
-      when: "2024"
-      language: English
-
   ## CLEVA (Chinese) Scenarios
   # Applications
   - name: cleva_closed_book_question_answering
(Diffs for the remaining 2 changed files were not loaded in this view.)