
Add Automatic GPT4V Evaluation for VLM Originality Evaluation #2576

Merged
merged 11 commits into from
May 23, 2024
6 changes: 6 additions & 0 deletions src/helm/benchmark/metrics/common_metric_specs.py
@@ -165,3 +165,9 @@ def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricS

def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])


def get_gpt4v_originality_metric_specs() -> List[MetricSpec]:
return [
MetricSpec(class_name="helm.benchmark.metrics.gpt4v_originality_metrics.GPT4VOriginalityMetric", args={}),
]
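Usage note (not part of the diff): a minimal sketch of how a run spec could consume the new helper, assuming only the import paths already shown in this PR.

# Illustrative only: fetch the originality metric specs for a VLM run spec.
from typing import List

from helm.benchmark.metrics.common_metric_specs import get_gpt4v_originality_metric_specs
from helm.benchmark.metrics.metric import MetricSpec

metric_specs: List[MetricSpec] = get_gpt4v_originality_metric_specs()
# -> [MetricSpec(class_name="helm.benchmark.metrics.gpt4v_originality_metrics.GPT4VOriginalityMetric", args={})]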
78 changes: 78 additions & 0 deletions src/helm/benchmark/metrics/gpt4v_originality_metrics.py
@@ -0,0 +1,78 @@
from typing import List

from helm.common.gpt4v_originality_request import (
GPT4VOriginalityRequestResult,
)
from helm.common.request import Request, RequestResult, GeneratedOutput
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.common.media_object import MultimediaObject, MediaObject, IMAGE_TYPE
from .metric import Metric
from .metric_name import MetricName
from .metric_service import MetricService
from .statistic import Stat


class GPT4VOriginalityMetric(Metric):
"""
Defines metrics for the originality evaluation based on GPT4V.
"""

GPT4V_ORIGINALITY_MODEL_NAME: str = "openai/gpt-4-vision-preview"

def __init__(self):
super().__init__()

def __repr__(self):
return "GPT4VOriginalityMetric()"

def _make_evaluation_content_from_multimedia(
self, input_media: MultimediaObject, input_text: str
) -> MultimediaObject:
"""
        Separates the image from the multimedia object and returns a new multimedia object with
the image and the given text.
"""
image_object = [item for item in input_media.media_objects if item.is_type(IMAGE_TYPE) and item.location]
text_object = MediaObject(text=input_text, content_type="text/plain")
return MultimediaObject(media_objects=[image_object[0], text_object])

def evaluate_generation(
self,
adapter_spec: AdapterSpec,
request_state: RequestState,
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:
"""
        Given the proper prompt, compute the originality scores of the VLM-generated content
        from the input image and the generated text.
"""
request: Request = request_state.request
# Predicted outputs and their originality scores
assert request_state.result is not None
request_result: RequestResult = request_state.result
# Get input image and generated response for the originality evaluation
assert request.multimodal_prompt is not None
input_media: MultimediaObject = request.multimodal_prompt
completions: List[GeneratedOutput] = request_result.completions

input_text: str = completions[0].text
evaluation_media: MultimediaObject = self._make_evaluation_content_from_multimedia(input_media, input_text)
response: GPT4VOriginalityRequestResult = metric_service.get_gpt4v_originality_scores(
request=Request(model=self.GPT4V_ORIGINALITY_MODEL_NAME, multimodal_prompt=evaluation_media)
)
if not response.success:
raise Exception(f"Failed to get GPT4V originality scores: {response}")

# Extract the originality scores from the response
originality_scores: List[float] = [output.score for output in response.scores]
num_originality_completions: int = len(originality_scores)

        max_originality_score: float = max(originality_scores) if num_originality_completions > 0 else 0.0
        mean_originality_score: float = (
            sum(originality_scores) / num_originality_completions if num_originality_completions > 0 else 0.0
        )
        stats: List[Stat] = [
            Stat(MetricName("expected_max_originality")).add(max_originality_score),
            Stat(MetricName("originality_score")).add(mean_originality_score),
        ]

return stats
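For intuition, a sketch (not part of the diff) of the evaluation content this metric assembles: the first image from the original multimodal prompt paired with the generated text. The image path and text below are hypothetical.

# Hypothetical example of the content built by _make_evaluation_content_from_multimedia.
from helm.benchmark.metrics.gpt4v_originality_metrics import GPT4VOriginalityMetric
from helm.common.media_object import MediaObject, MultimediaObject

input_media = MultimediaObject(
    media_objects=[
        MediaObject(location="images/episode_0.png", content_type="image/png"),  # hypothetical path
        MediaObject(text="Write a creative and original story for the given image sequence.", content_type="text/plain"),
    ]
)
generated_text = "Once upon a time ..."  # first completion from the evaluated VLM

metric = GPT4VOriginalityMetric()
evaluation_media = metric._make_evaluation_content_from_multimedia(input_media, generated_text)
# evaluation_media now holds the image plus the generated text, ready to be scored by GPT4V.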
5 changes: 5 additions & 0 deletions src/helm/benchmark/metrics/metric_service.py
@@ -6,6 +6,8 @@
from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
from helm.common.gpt4v_originality_request import GPT4VOriginalityRequestResult
from helm.common.request import Request
from helm.benchmark.window_services.tokenizer_service import TokenizerService
from helm.proxy.services.service import Service
from helm.common.cache import Cache
@@ -31,6 +33,9 @@ def upload(self, request: FileUploadRequest) -> FileUploadResult:
def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
return self._service.get_toxicity_scores(self._auth, request)

def get_gpt4v_originality_scores(self, request: Request) -> GPT4VOriginalityRequestResult:
return self._service.get_gpt4v_originality_scores(self._auth, request)

def make_critique_request(self, request: CritiqueRequest) -> Optional[CritiqueRequestResult]:
return self._service.make_critique_request(self._auth, request)

12 changes: 10 additions & 2 deletions src/helm/benchmark/run_specs/vlm_run_specs.py
@@ -11,6 +11,7 @@
get_basic_reference_metric_specs,
get_exact_match_metric_specs,
get_open_ended_generation_metric_specs,
get_gpt4v_originality_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function
@@ -56,6 +57,13 @@ def get_short_answer_generation_adapter_spec():
)


def get_open_end_answer_generation_adapter_spec():
return get_generation_adapter_spec(
instructions="Follow the given instruction and give your complete answer.",
max_tokens=500,
)


def get_multiple_choice_joint_adapter_spec(
input_noun: Optional[str],
output_noun: str,
@@ -488,8 +496,8 @@ def get_mementos_spec(subject: str) -> RunSpec:
class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
args={"subject": subject},
)
adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
adapter_spec: AdapterSpec = get_open_end_answer_generation_adapter_spec()
metric_specs: List[MetricSpec] = get_gpt4v_originality_metric_specs()

run_spec_name: str = "mementos"
return RunSpec(
src/helm/benchmark/scenarios/vision_language/mementos_scenario.py
@@ -51,6 +51,8 @@ class MementosScenario(Scenario):
"Write a description for the given image sequence in a single paragraph, what is happening in this episode?"
)

ORIGINALITY_QUESTION_PROMPT: str = "Write a creative and original story for the given image sequence."

SUBJECTS: List[str] = ["comics", "dailylife", "robotics"]

name = "mementos"
@@ -98,7 +100,7 @@ def get_instances(self, output_path: str) -> List[Instance]:

content: List[MediaObject] = [
MediaObject(location=local_image_path, content_type="image/png"),
MediaObject(text=self.QUESTION_PROMPT, content_type="text/plain"),
MediaObject(text=self.ORIGINALITY_QUESTION_PROMPT, content_type="text/plain"),
]
answer: str = row["description"]
instances.append(
23 changes: 23 additions & 0 deletions src/helm/clients/auto_client.py
@@ -16,6 +16,7 @@
from helm.clients.moderation_api_client import ModerationAPIClient
from helm.proxy.critique.critique_client import CritiqueClient
from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
from helm.clients.gpt4v_originality_client import GPT4VOriginalityClient
from helm.proxy.retry import NonRetriableException, retry_request
from helm.tokenizers.auto_tokenizer import AutoTokenizer

@@ -151,6 +152,28 @@ def get_toxicity_classifier_client(self) -> ToxicityClassifierClient:
cache_config: CacheConfig = self.cache_backend_config.get_cache_config("perspectiveapi")
return PerspectiveAPIClient(self.credentials.get("perspectiveApiKey", ""), cache_config)

def get_gpt4v_originality_client(self) -> GPT4VOriginalityClient:
"""Get the GPT4V originality client for image and text input."""

        # For now, we only support GPT4V for evaluating the originality scores of VLMs.
originality_model_deployment_name = "openai/gpt-4-vision-preview"

model_deployment: ModelDeployment = get_model_deployment(originality_model_deployment_name)
host_organization: str = model_deployment.host_organization
cache_config: CacheConfig = self.cache_backend_config.get_cache_config(host_organization)

client_params = {
"cache_config": cache_config,
"tokenizer_name": model_deployment.tokenizer_name,
"api_key": provide_api_key(self.credentials, host_organization, originality_model_deployment_name),
"tokenizer": self._auto_tokenizer._get_tokenizer(
tokenizer_name=model_deployment.tokenizer_name or model_deployment.name
),
"org_id": self.credentials.get(host_organization + "OrgId", None), # OpenAI, GooseAI, Microsoft
}

return GPT4VOriginalityClient(**client_params)

def get_moderation_api_client(self) -> ModerationAPIClient:
"""Get the ModerationAPI client."""
cache_config: CacheConfig = self.cache_backend_config.get_cache_config("ModerationAPI")
189 changes: 189 additions & 0 deletions src/helm/clients/gpt4v_originality_client.py
@@ -0,0 +1,189 @@
# mypy: check_untyped_defs = False
from re import search, Match
from typing import Any, Dict, List, Optional, cast, Union

from helm.benchmark.model_metadata_registry import is_vlm
from helm.common.media_object import TEXT_TYPE
from helm.common.request import GeneratedOutput, Token, Request
from helm.common.hierarchical_logger import hlog
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.gpt4v_originality_request import (
wrap_request_time,
GPT4VOriginalityRequestResult,
GPT4VScoreOutput,
)
from helm.common.tokenization_request import (
TokenizationRequest,
TokenizationRequestResult,
)
from .client import truncate_sequence
from .openai_client import OpenAIClient

try:
import openai
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["openai"])


class GPT4VOriginalityClient(OpenAIClient):
# TODO: Design a more structured prompt for evaluation. We need to explain each level of the
# TODO: originality score in one short sentence.
EVALUATION_PROMPT_TEMPLATE = (
"Please rate the generated texts given the image from 1 to 5. Try your best "
"to give only the rating without other explanations.\n{GENERATED_TEXT}"
)
REX_PATTERN = r"[1-5]"

    def _is_gpt4v_model_engine(self, model_engine: str) -> bool:
        return model_engine.startswith("gpt-4-vision")

def _reformat_text_request(self, original_text_input: str) -> str:
return self.EVALUATION_PROMPT_TEMPLATE.format(GENERATED_TEXT=original_text_input)

def _convert_completion_to_originality_score(self, completions: GeneratedOutput) -> GPT4VScoreOutput:
# TODO: We might consider improving the extraction process of GPT4V generated score here.
new_text: str = completions.text
match_seq: Optional[Match[str]] = search(self.REX_PATTERN, new_text)
if match_seq:
new_score: float = float(match_seq.group())
else:
raise ValueError(f"Could not find a score in the completion text: {new_text}")
gpt4vscore = GPT4VScoreOutput(score=new_score, logprob=completions.logprob, tokens=completions.tokens)
return gpt4vscore

def _make_scoring_request(self, request: Request) -> GPT4VOriginalityRequestResult:
messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
# Only support multimodal_prompt as the input for now
# Convert prompt into a single message
# For now, put the whole prompt in a single user message, and expect the response
# to be returned in a single assistant message.
# TODO: Support ChatML for creating multiple messages with different roles.
# See: https://github.com/openai/openai-python/blob/main/chatml.md

# Content can either be text or a list of multimodal content made up of text and images:
# https://platform.openai.com/docs/guides/vision
content: Union[str, List[Union[str, Any]]]
if request.multimodal_prompt is not None:
content = []
for media_object in request.multimodal_prompt.media_objects:
if media_object.is_type("image") and media_object.location:
from helm.common.images_utils import encode_base64

base64_image: str = encode_base64(media_object.location)
content.append(
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
)
elif media_object.is_type(TEXT_TYPE):
if media_object.text is None:
raise ValueError("MediaObject of text type has missing text field value")
text_input: str = self._reformat_text_request(media_object.text)
content.append({"type": media_object.type, "text": text_input})
else:
raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

else:
raise ValueError("Input request has missing multimodal prompt value")

messages = [{"role": "user", "content": content}]

        # Fix most of the generation parameters here.
raw_request: Dict[str, Any] = {
"model": self._get_model_for_request(request),
"messages": messages,
"temperature": 1.0,
"top_p": 1.0,
"n": 1,
"stop": None,
# Note: Chat models may require adding an extra token to max_tokens
# for the internal special role token.
"max_tokens": 15,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
}

# OpenAI's vision API doesn't allow None values for stop.
# Fails with "body -> stop: none is not an allowed value" if None is passed.
if is_vlm(request.model) and raw_request["stop"] is None:
raw_request.pop("stop")

def do_it() -> Dict[str, Any]:
return self.client.chat.completions.create(**raw_request).model_dump(mode="json")

try:
cache_key = self._get_cache_key(raw_request, request)
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
except openai.OpenAIError as e:
if self.INAPPROPRIATE_IMAGE_ERROR in str(e):
hlog(f"Failed safety check: {str(request)}")
empty_completion = GeneratedOutput(
text="",
logprob=0,
tokens=[],
finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
)
empty_score = GPT4VScoreOutput(
score=0.0,
logprob=0,
tokens=[],
finish_reason={"reason": self.CONTENT_POLICY_VIOLATED_FINISH_REASON},
)
return GPT4VOriginalityRequestResult(
success=True,
cached=False,
request_time=0,
completions=[empty_completion] * request.num_completions,
scores=[empty_score] * request.num_completions,
)

error: str = f"OpenAI error: {e}"
return GPT4VOriginalityRequestResult(success=False, cached=False, error=error, completions=[], scores=[])

scores: List[GPT4VScoreOutput] = []
completions: List[GeneratedOutput] = []
for raw_completion in response["choices"]:
# The OpenAI chat completion API doesn't support echo.
# If `echo_prompt` is true, combine the prompt and completion.
raw_completion_content = raw_completion["message"]["content"]
text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
# The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
TokenizationRequest(text, tokenizer=self.tokenizer_name)
)
            # Log probs are not currently supported by the OpenAI chat completion API, so set to 0 for now.
tokens: List[Token] = [
Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
]
completion = GeneratedOutput(
text=text,
logprob=0, # OpenAI does not provide logprobs
tokens=tokens,
finish_reason={"reason": raw_completion["finish_reason"]},
)
# Truncate the text by stop sequences
truncated_completion: GeneratedOutput = truncate_sequence(completion, request)
completions.append(truncated_completion)
# Convert the completion to originality score output
scores.append(self._convert_completion_to_originality_score(truncated_completion))

return GPT4VOriginalityRequestResult(
success=True,
cached=cached,
request_time=response["request_time"],
request_datetime=response.get("request_datetime"),
scores=scores,
completions=completions,
)

def get_originality_scores(self, request: Request) -> GPT4VOriginalityRequestResult:
"""
Compute the originality score of a pair of given text and image using
OpenAI GPT models.
Returns a value from 1 to 5.
"""
# We currently only support GPT4V evaluation.
assert self._is_gpt4v_model_engine(
request.model_engine
), f"Expect the model to be the GPT4V model engine, but got {request.model_engine}."
return self._make_scoring_request(request)
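For reference, a hedged usage sketch of the client's public entry point. The image path and text are made up for illustration, and `client` is assumed to be an already-constructed GPT4VOriginalityClient (in practice obtained via AutoClient.get_gpt4v_originality_client, shown earlier in this PR).

# Hypothetical usage sketch: score one image/text pair with a configured client.
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.request import Request

prompt = MultimediaObject(
    media_objects=[
        MediaObject(location="images/episode_0.png", content_type="image/png"),  # hypothetical path
        MediaObject(text="A generated story to be rated for originality ...", content_type="text/plain"),
    ]
)
request = Request(model="openai/gpt-4-vision-preview", multimodal_prompt=prompt)

# `client` would typically come from AutoClient.get_gpt4v_originality_client() (see auto_client.py above).
result = client.get_originality_scores(request)
if result.success:
    print([output.score for output in result.scores])  # each score should fall in the 1-5 range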