
Add Automatic GPT4V Evaluation for VLM Originality Evaluation #2576

Merged
merged 11 commits into from
May 23, 2024
6 changes: 6 additions & 0 deletions src/helm/benchmark/metrics/common_metric_specs.py
@@ -165,3 +165,9 @@ def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricS

def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])


def get_gpt4v_originality_metric_specs() -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.metrics.gpt4v_originality_metrics.GPT4VOriginalityMetric", args={}),
]
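Usage note (not part of the diff): a minimal sketch of how a run spec could consume the new helper, assuming only the import paths already shown in this PR.

# Illustrative only: fetch the originality metric specs for a VLM run spec.
from typing import List

from helm.benchmark.metrics.common_metric_specs import get_gpt4v_originality_metric_specs
from helm.benchmark.metrics.metric import MetricSpec

metric_specs: List[MetricSpec] = get_gpt4v_originality_metric_specs()
# -> [MetricSpec(class_name="helm.benchmark.metrics.gpt4v_originality_metrics.GPT4VOriginalityMetric", args={})]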
78 changes: 78 additions & 0 deletions src/helm/benchmark/metrics/gpt4v_originality_metrics.py
@@ -0,0 +1,78 @@
from typing import List

from helm.common.gpt4v_originality_request import (
GPT4VOriginalityRequestResult,
)
from helm.common.request import Request, RequestResult, GeneratedOutput
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.common.media_object import MultimediaObject, MediaObject, IMAGE_TYPE
from .metric import Metric
from .metric_name import MetricName
from .metric_service import MetricService
from .statistic import Stat


class GPT4VOriginalityMetric(Metric):
"""
Defines metrics for the originality evaluation based on GPT4V.
"""

GPT4V_ORIGINALITY_MODEL_NAME: str = "openai/gpt-4-vision-preview"

def __init__(self):
super().__init__()

def __repr__(self):
return "GPT4VOriginalityMetric()"

def _make_evaluation_content_from_multimedia(
self, input_media: MultimediaObject, input_text: str
) -> MultimediaObject:
"""
        Separates the image from the multimedia object and returns a new multimedia object with
the image and the given text.
"""
image_object = [item for item in input_media.media_objects if item.is_type(IMAGE_TYPE) and item.location]
text_object = MediaObject(text=input_text, content_type="text/plain")
return MultimediaObject(media_objects=[image_object[0], text_object])

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
        Given the proper prompt, compute the originality scores of the VLM-generated content
        from the input image and the generated text.
"""
request: Request = request_state.request
# Predicted outputs and their originality scores
assert request_state.result is not None
request_result: RequestResult = request_state.result
# Get input image and generated response for the originality evaluation
assert request.multimodal_prompt is not None
input_media: MultimediaObject = request.multimodal_prompt
completions: List[GeneratedOutput] = request_result.completions

input_text: str = completions[0].text
evaluation_media: MultimediaObject = self._make_evaluation_content_from_multimedia(input_media, input_text)
response: GPT4VOriginalityRequestResult = metric_service.get_gpt4v_originality_scores(
request=Request(model=self.GPT4V_ORIGINALITY_MODEL_NAME, multimodal_prompt=evaluation_media)
)
if not response.success:
raise Exception(f"Failed to get GPT4V originality scores: {response}")

# Extract the originality scores from the response
originality_scores: List[float] = [output.score for output in response.scores]
num_originality_completions: int = len(originality_scores)

        max_originality_score: float = max(originality_scores) if num_originality_completions > 0 else 0.0
        mean_originality_score: float = (
            sum(originality_scores) / num_originality_completions if num_originality_completions > 0 else 0.0
        )
        stats: List[Stat] = [
            Stat(MetricName("expected_max_originality")).add(max_originality_score),
            Stat(MetricName("originality_score")).add(mean_originality_score),
        ]

return stats
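For intuition, a sketch (not part of the diff) of the evaluation content this metric assembles: the first image from the original multimodal prompt paired with the generated text. The image path and text below are hypothetical.

# Hypothetical example of the content built by _make_evaluation_content_from_multimedia.
from helm.benchmark.metrics.gpt4v_originality_metrics import GPT4VOriginalityMetric
from helm.common.media_object import MediaObject, MultimediaObject

input_media = MultimediaObject(
    media_objects=[
        MediaObject(location="images/episode_0.png", content_type="image/png"),  # hypothetical path
        MediaObject(text="Write a creative and original story for the given image sequence.", content_type="text/plain"),
    ]
)
generated_text = "Once upon a time ..."  # first completion from the evaluated VLM

metric = GPT4VOriginalityMetric()
evaluation_media = metric._make_evaluation_content_from_multimedia(input_media, generated_text)
# evaluation_media now holds the image plus the generated text, ready to be scored by GPT4V.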
5 changes: 5 additions & 0 deletions src/helm/benchmark/metrics/metric_service.py
@@ -6,6 +6,8 @@
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
from helm.common.gpt4v_originality_request import GPT4VOriginalityRequestResult
from helm.common.request import Request
from helm.benchmark.window_services.tokenizer_service import TokenizerService
from helm.proxy.services.service import Service
from helm.common.cache import Cache
@@ -31,6 +33,9 @@ def upload(self, request: FileUploadRequest) -> FileUploadResult:
def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
return self._service.get_toxicity_scores(self._auth, request)

def get_gpt4v_originality_scores(self, request: Request) -> GPT4VOriginalityRequestResult:
return self._service.get_gpt4v_originality_scores(self._auth, request)

def make_critique_request(self, request: CritiqueRequest) -> Optional[CritiqueRequestResult]:
return self._service.make_critique_request(self._auth, request)

12 changes: 10 additions & 2 deletions src/helm/benchmark/run_specs/vlm_run_specs.py
@@ -11,6 +11,7 @@
get_basic_reference_metric_specs,
get_exact_match_metric_specs,
get_open_ended_generation_metric_specs,
get_gpt4v_originality_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -56,6 +57,13 @@ def get_short_answer_generation_adapter_spec():
)


def get_open_end_answer_generation_adapter_spec():
return get_generation_adapter_spec(
instructions="Follow the given instruction and give your complete answer.",
max_tokens=500,
)


def get_multiple_choice_joint_adapter_spec(
input_noun: Optional[str],
output_noun: str,
@@ -488,8 +496,8 @@ def get_mementos_spec(subject: str) -> RunSpec:
class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
args={"subject": subject},
)
adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
metric_specs: List[MetricSpec] = get_gpt4v_originality_metric_specs()

run_spec_name: str = "mementos"
return RunSpec(
src/helm/benchmark/scenarios/vision_language/mementos_scenario.py
@@ -51,6 +51,8 @@ class MementosScenario(Scenario):
"Write a description for the given image sequence in a single paragraph, what is happening in this episode?"
)

ORIGINALITY_QUESTION_PROMPT: str = "Write a creative and original story for the given image sequence."

SUBJECTS: List[str] = ["comics", "dailylife", "robotics"]

name = "mementos"
@@ -98,7 +100,7 @@ def get_instances(self, output_path: str) -> List[Instance]:

content: List[MediaObject] = [
MediaObject(location=local_image_path, content_type="image/png"),
MediaObject(text=self.QUESTION_PROMPT, content_type="text/plain"),
MediaObject(text=self.ORIGINALITY_QUESTION_PROMPT, content_type="text/plain"),
]
answer: str = row["description"]
instances.append(
23 changes: 23 additions & 0 deletions src/helm/clients/auto_client.py
@@ -16,6 +16,7 @@
from helm.clients.moderation_api_client import ModerationAPIClient
from helm.proxy.critique.critique_client import CritiqueClient
from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
from helm.clients.gpt4v_originality_client import GPT4VOriginalityClient
from helm.proxy.retry import NonRetriableException, retry_request
from helm.tokenizers.auto_tokenizer import AutoTokenizer

@@ -151,6 +152,28 @@ def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
cache_config: CacheConfig = self.cache_backend_config.get_cache_config("perspectiveapi")
return PerspectiveAPIClient(self.credentials.get("perspectiveApiKey", ""), cache_config)

def get_gpt4v_originality_client(self) -> GPT4VOriginalityClient:
"""Get the GPT4V originality client for image and text input."""

        # For now, we only support GPT4V for evaluating the originality scores of VLMs.
originality_model_deployment_name = "openai/gpt-4-vision-preview"

model_deployment: ModelDeployment = get_model_deployment(originality_model_deployment_name)
host_organization: str = model_deployment.host_organization
cache_config: CacheConfig = self.cache_backend_config.get_cache_config(host_organization)

client_params = {
"cache_config": cache_config,
"tokenizer_name": model_deployment.tokenizer_name,
"api_key": provide_api_key(self.credentials, host_organization, originality_model_deployment_name),
"tokenizer": self._auto_tokenizer._get_tokenizer(
tokenizer_name=model_deployment.tokenizer_name or model_deployment.name
),
"org_id": self.credentials.get(host_organization + "OrgId", None), # OpenAI, GooseAI, Microsoft
}

return GPT4VOriginalityClient(**client_params)

def get_moderation_api_client(self) -> ModerationAPIClient:
"""Get the ModerationAPI client."""
cache_config: CacheConfig = self.cache_backend_config.get_cache_config("ModerationAPI")
189 changes: 189 additions & 0 deletions src/helm/clients/gpt4v_originality_client.py
@@ -0,0 +1,189 @@
# mypy: check_untyped_defs = False
from re import search, Match
from typing import Any, Dict, List, Optional, cast, Union

from helm.benchmark.model_metadata_registry import is_vlm
from helm.common.media_object import TEXT_TYPE
from helm.common.request import GeneratedOutput, Token, Request
from helm.common.hierarchical_logger import hlog
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.gpt4v_originality_request import (
wrap_request_time,
GPT4VOriginalityRequestResult,
GPT4VScoreOutput,
)
from helm.common.tokenization_request import (
TokenizationRequest,
TokenizationRequestResult,
)
from .client import truncate_sequence
from .openai_client import OpenAIClient

try:
import openai
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["openai"])


class GPT4VOriginalityClient(OpenAIClient):
# TODO: Design a more structured prompt for evaluation. We need to explain each level of the
# TODO: originality score in one short sentence.
EVALUATION_PROMPT_TEMPLATE = (
"Please rate the generated texts given the image from 1 to 5. Try your best "
"to give only the rating without other explanations.\n{GENERATED_TEXT}"
)
REX_PATTERN = r"[1-5]"

    def _is_gpt4v_model_engine(self, model_engine: str) -> bool:
        return model_engine.startswith("gpt-4-vision")

def _reformat_text_request(self, original_text_input: str) -> str:
return self.EVALUATION_PROMPT_TEMPLATE.format(GENERATED_TEXT=original_text_input)

def _convert_completion_to_originality_score(self, completions: GeneratedOutput) -> GPT4VScoreOutput:
# TODO: We might consider improving the extraction process of GPT4V generated score here.
new_text: str = completions.text
match_seq: Optional[Match[str]] = search(self.REX_PATTERN, new_text)
if match_seq:
new_score: float = float(match_seq.group())
else:
raise ValueError(f"Could not find a score in the completion text: {new_text}")
gpt4vscore = GPT4VScoreOutput(score=new_score, logprob=completions.logprob, tokens=completions.tokens)
return gpt4vscore

def _make_scoring_request(self, request: Request) -> GPT4VOriginalityRequestResult:
messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
# Only support multimodal_prompt as the input for now
# Convert prompt into a single message
# For now, put the whole prompt in a single user message, and expect the response
# to be returned in a single assistant message.
# TODO: Support ChatML for creating multiple messages with different roles.
# See: https://github.com/openai/openai-python/blob/main/chatml.md

# Content can either be text or a list of multimodal content made up of text and images:
# https://platform.openai.com/docs/guides/vision
content: Union[str, List[Union[str, Any]]]
if request.multimodal_prompt is not None:
content = []
for media_object in request.multimodal_prompt.media_objects:
if media_object.is_type("image") and media_object.location:
from helm.common.images_utils import encode_base64

base64_image: str = encode_base64(media_object.location)
content.append(
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
)
elif media_object.is_type(TEXT_TYPE):
if media_object.text is None:
raise ValueError("MediaObject of text type has missing text field value")
text_input: str = self._reformat_text_request(media_object.text)
content.append({"type": media_object.type, "text": text_input})
else:
raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

else:
raise ValueError("Input request has missing multimodal prompt value")

messages = [{"role": "user", "content": content}]

        # Fix most of the generation parameters here.
raw_request: Dict[str, Any] = {
"model": self._get_model_for_request(request),
"messages": messages,
"temperature": 1.0,
"top_p": 1.0,
"n": 1,
"stop": None,
# Note: Chat models may require adding an extra token to max_tokens
# for the internal special role token.
"max_tokens": 15,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
}

# OpenAI's vision API doesn't allow None values for stop.
# Fails with "body -> stop: none is not an allowed value" if None is passed.
if is_vlm(request.model) and raw_request["stop"] is None:
raw_request.pop("stop")

def do_it() -> Dict[str, Any]:
return self.client.chat.completions.create(**raw_request).model_dump(mode="json")

try:
cache_key = self._get_cache_key(raw_request, request)
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
except openai.OpenAIError as e:
if self.INAPPROPRIATE_IMAGE_ERROR in str(e):
hlog(f"Failed safety check: {str(request)}")
empty_completion = GeneratedOutput(
text="",
logprob=0,
tokens=[],
finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
)
empty_score = GPT4VScoreOutput(
score=0.0,
logprob=0,
tokens=[],
finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
)
return GPT4VOriginalityRequestResult(
success=True,
cached=False,
request_time=0,
completions=[empty_completion] * request.num_completions,
scores=[empty_score] * request.num_completions,
)

error: str = f"OpenAI error: {e}"
return GPT4VOriginalityRequestResult(success=False, cached=False, error=error, completions=[], scores=[])

scores: List[GPT4VScoreOutput] = []
completions: List[GeneratedOutput] = []
for raw_completion in response["choices"]:
# The OpenAI chat completion API doesn't support echo.
# If `echo_prompt` is true, combine the prompt and completion.
raw_completion_content = raw_completion["message"]["content"]
text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
# The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
TokenizationRequest(text, tokenizer=self.tokenizer_name)
)
            # Log probs are not currently supported by the OpenAI chat completion API, so set to 0 for now.
tokens: List[Token] = [
Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
]
completion = GeneratedOutput(
text=text,
logprob=0, # OpenAI does not provide logprobs
tokens=tokens,
finish_reason={"reason": raw_completion["finish_reason"]},
)
# Truncate the text by stop sequences
truncated_completion: GeneratedOutput = truncate_sequence(completion, request)
completions.append(truncated_completion)
# Convert the completion to originality score output
scores.append(self._convert_completion_to_originality_score(truncated_completion))

return GPT4VOriginalityRequestResult(
success=True,
cached=cached,
request_time=response["request_time"],
request_datetime=response.get("request_datetime"),
scores=scores,
completions=completions,
)

def get_originality_scores(self, request: Request) -> GPT4VOriginalityRequestResult:
"""
Compute the originality score of a pair of given text and image using
OpenAI GPT models.
Returns a value from 1 to 5.
"""
# We currently only support GPT4V evaluation.
assert self._is_gpt4v_model_engine(
request.model_engine
), f"Expect the model to be the GPT4V model engine, but got {request.model_engine}."
return self._make_scoring_request(request)
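For reference, a hedged usage sketch of the client's public entry point. The image path and text are made up for illustration, and `client` is assumed to be an already-constructed GPT4VOriginalityClient (in practice obtained via AutoClient.get_gpt4v_originality_client, shown earlier in this PR).

# Hypothetical usage sketch: score one image/text pair with a configured client.
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.request import Request

prompt = MultimediaObject(
    media_objects=[
        MediaObject(location="images/episode_0.png", content_type="image/png"),  # hypothetical path
        MediaObject(text="A generated story to be rated for originality ...", content_type="text/plain"),
    ]
)
request = Request(model="openai/gpt-4-vision-preview", multimodal_prompt=prompt)

# `client` would typically come from AutoClient.get_gpt4v_originality_client() (see auto_client.py above).
result = client.get_originality_scores(request)
if result.success:
    print([output.score for output in result.scores])  # each score should fall in the 1-5 range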