Refactor live_bench_2409.yaml and live_bench.yaml
pufanyi committed Oct 7, 2024
1 parent 9a20b96 commit 052517c
Showing 20 changed files with 1,059 additions and 78 deletions.
2 changes: 1 addition & 1 deletion lmms_eval/tasks/live_bench/live_bench.yaml
@@ -4,5 +4,5 @@ task:
  - live_bench_2407

metadata:
-  api_type : openai
+  api_type: azure
  eval_with_mini: false
3 changes: 3 additions & 0 deletions lmms_eval/tasks/live_bench/live_bench_2409.yaml
@@ -0,0 +1,3 @@
task: "live_bench_2409"
dataset_name: 2024-09
include: live_bench_template_yaml_v2
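The new task file stays at three lines because everything shared lives in the included template; only `dataset_name` changes per release. Below is a minimal sketch of how such an `include:` directive could be expanded — lmms-eval's actual loader may differ, `load_task_config` is an illustrative name, and the `!function` lines are filtered out first because plain `yaml.safe_load` cannot parse them (the same trick utils_v2.py uses further down):

```python
import yaml
from pathlib import Path


def load_task_config(path: Path) -> dict:
    """Illustrative include-expansion: template keys first, task keys override."""
    # drop "!function ..." lines, which yaml.safe_load cannot handle
    text = "".join(line for line in path.read_text().splitlines(keepends=True) if "!function" not in line)
    cfg = yaml.safe_load(text)
    if "include" in cfg:
        base = load_task_config(path.parent / cfg.pop("include"))
        base.update(cfg)  # e.g. dataset_name: 2024-09 overrides the template's 2024-07
        cfg = base
    return cfg
```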
29 changes: 29 additions & 0 deletions lmms_eval/tasks/live_bench/live_bench_template_yaml_v2
@@ -0,0 +1,29 @@
dataset_path: lmms-lab/LiveBench
dataset_kwargs:
  token: True
test_split: test
dataset_name: 2024-07
output_type: generate_until
doc_to_visual: !function utils_v2.livebench_doc_to_visual
doc_to_text: !function utils_v2.livebench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 1024
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils_v2.livebench_process_results
process_results_use_image: true
metric_list:
  - metric: gpt4_eval_score
    aggregation: !function utils_v2.livebench_aggregate_results
    higher_is_better: true
  # - metric: gpt4_eval_score_mini
  #   aggregation: !function utils.livebench_aggregate_results
  #   higher_is_better: true

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
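The `!function utils_v2.…` values bind config keys to Python callables in the task's utils module. A hedged sketch of how such a tag could be resolved to a callable — `resolve_function_tag` is an illustrative name, not lmms-eval's actual API:

```python
import importlib


def resolve_function_tag(tag_value: str):
    """Map a tag like 'utils_v2.livebench_doc_to_text' to the callable it names."""
    module_name, _, attr = tag_value.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)
```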
216 changes: 216 additions & 0 deletions lmms_eval/tasks/live_bench/utils_v2.py
@@ -0,0 +1,216 @@
import base64
import json
import logging
import os
import time
from io import BytesIO
from pathlib import Path

import numpy as np
import openai
import pandas as pd
import requests
import yaml
from tqdm import tqdm

eval_logger = logging.getLogger("lmms-eval")


with open(Path(__file__).parent / "live_bench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

config = yaml.safe_load("".join(safe_data))

API_TYPE = config["metadata"]["api_type"]
EVAL_WITH_MINI = config["metadata"]["eval_with_mini"]


def get_openai_client(api_version="2024-02-15-preview") -> openai.OpenAI:
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    if endpoint:
        key = os.getenv("AZURE_OPENAI_API_KEY")
        if not key:
            raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
        return openai.AzureOpenAI(azure_endpoint=endpoint, api_key=key, api_version=api_version)
    else:
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set.")
        return openai.OpenAI(api_key=api_key)


client = get_openai_client()

_PROMPT_WITH_IMAGE = """\
[Question]
{prompt}
[Assistant Response]
{generation}
[Ground Truth Response]
{reference}
[System]
Rate whether the assistant response correctly matches the ground truth, in regards to the image above.
The rating should be 0-10, where 0 is incorrect and 10 is correct.
Below is the specific criteria for rating:
{criteria}
Total score is out of 10. If the model's answer cannot be provided due to political reasons, please assign a score of 0 for further processing. If the model's response is biased due to political factors, please score it based on its understanding of the image, but reduce the objectivity score accordingly.
Your response should be in the JSON format:
```json
{{
"Explanation": "(your explanation)",
"Rating": "(int)"
}}
```
"""


def format_prompt(question, ground_truth_answer, answer, criteria):
    return _PROMPT_WITH_IMAGE.format(prompt=question, generation=answer, reference=ground_truth_answer, criteria=criteria)
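Note the slot renaming: `question` fills `{prompt}`, the model's output fills `{generation}`, and the ground truth fills `{reference}`. A toy call (values invented, not from the dataset) makes the mapping concrete:

```python
example_judge_prompt = format_prompt(
    question="What headline is shown on the page?",
    ground_truth_answer="Apple brings ChatGPT to iPhones in AI overhaul",
    answer="The page announces ChatGPT coming to the iPhone.",
    criteria="Award up to 10 marks for correctly identifying the headline.",
)
```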


def get_chat_response(gpt_model_name, base64_images, question, ground_truth_answer, answer, criteria, max_retries=5, wait_time=10):
    content = []
    for base64_image in base64_images:
        # images are PNG-encoded by image_to_base64 below, so declare image/png
        content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}})
    prompt = format_prompt(question, ground_truth_answer, answer, criteria)
    content.append(
        {
            "type": "text",
            "text": prompt,
        }
    )

    messages = [
        {
            "role": "user",
            "content": content,
        }
    ]

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(model=gpt_model_name, messages=messages, max_tokens=1024, response_format={"type": "json_object"}, temperature=0.0)
            response_data = json.loads(response.choices[0].message.content)
            # coerce to int so the downstream "rating >= 0" comparison is safe
            rating = int(response_data["Rating"])
            explanation = response_data["Explanation"]
            return rating, explanation, gpt_model_name
        except (openai.APIError, requests.exceptions.RequestException) as e:
            # the v1 openai client raises openai.APIError subclasses, so retry those too
            eval_logger.warning(f"Request failed on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                eval_logger.error(f"Failed to get response after {max_retries} attempts")
                return -1, str(e), gpt_model_name
            time.sleep(wait_time)
        except Exception as e:
            eval_logger.error(f"Error on attempt {attempt + 1}: {e}")
            return -1, str(e), gpt_model_name


def image_to_base64(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
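A quick sanity check for the encoder (a sketch assuming Pillow is installed; not part of the committed file): the payload should decode back to PNG bytes, matching the `data:image/png` URLs built in get_chat_response above.

```python
import base64

from PIL import Image

img = Image.new("RGB", (4, 4), "red")
payload = image_to_base64(img)  # image_to_base64 as defined above
assert base64.b64decode(payload).startswith(b"\x89PNG\r\n\x1a\n")  # PNG magic bytes
```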


_images = {}

dataset = None


def livebench_doc_to_visual(doc):
    img_list = [image.convert("RGB") for image in doc["images"]]
    return img_list


def livebench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['question']}{post_prompt}"


SUBTASKS = ["Basic Understanding", "Analytical Questions", "Divergent Thinking", "Real-world Assistance"]


def livebench_process_results_for_name(doc, results, model, eval_name):
    base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)]
    subtask = doc["subtask"]
    criteria = doc["criteria"]
    if subtask not in SUBTASKS:
        subtask = "further insights"
    if not results or results[0] == "":
        return {eval_name: {"rating": 0, "explanation": "No response", "model_name": "N/A", "subtask": subtask}}
    rating, explanation, model_name = get_chat_response(
        gpt_model_name=model,
        base64_images=base64_images,
        question=doc["question"],
        ground_truth_answer=doc["answer"],
        answer=results[0],
        criteria=criteria,
    )
    if rating >= 0:
        return {eval_name: {"rating": rating, "explanation": explanation, "model_name": model_name, "subtask": subtask, "id": doc["id"]}}
    else:
        return {eval_name: {"rating": -1, "explanation": explanation, "model_name": "N/A", "subtask": subtask, "id": doc["id"]}}


def livebench_process_results_4o(doc, results):
    return livebench_process_results_for_name(doc, results, "gpt-4o", "gpt4_eval_score")


def livebench_process_results_4o_mini(doc, results):
    return livebench_process_results_for_name(doc, results, "gpt-4o-mini", "gpt4_eval_score_mini")


def livebench_process_results(doc, results):
    res = livebench_process_results_4o(doc, results)
    if EVAL_WITH_MINI:
        res.update(livebench_process_results_4o_mini(doc, results))
    return res
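For reference, the per-document mapping this returns has the following shape (values fabricated; a `gpt4_eval_score_mini` entry is added alongside it only when `eval_with_mini` is true in live_bench.yaml):

```python
example_per_doc_result = {
    "gpt4_eval_score": {
        "rating": 8,
        "explanation": "Matches the ground truth on all key points.",
        "model_name": "gpt-4o",
        "subtask": "Basic Understanding",
        "id": 42,
    },
}
```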


def livebench_aggregate_results(results):
    sum_score, count = 0, 0
    score = {subtask: [] for subtask in SUBTASKS}
    for result in results:
        if result["rating"] == -1:
            continue
        sum_score += result["rating"] / 10
        count += 1
        subtask = result["subtask"]
        if subtask not in SUBTASKS:
            subtask = "OTHER_SUBTASK"
        # append under the normalized name so an unknown subtask cannot raise a KeyError
        score.setdefault(subtask, []).append(result["rating"] / 10)
    res = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100 if score[subtask] else 0.0) for subtask in SUBTASKS]
    res.append(("Total", count, sum_score / count * 100 if count > 0 else 0.0))
    res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"])
    print("=" * 50)
    print(res)
    print("=" * 50)
    if count == 0:
        eval_logger.warning("No valid scores to aggregate")
    return sum_score / count * 100 if count > 0 else None
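A minimal smoke test for the aggregator, using fabricated ratings rather than real benchmark data: two valid scores of 8/10 and 5/10 average to 0.65, so the returned total is 65.0, and the -1 entry is skipped.

```python
fake_results = [
    {"rating": 8, "subtask": "Basic Understanding"},
    {"rating": 5, "subtask": "Analytical Questions"},
    {"rating": -1, "subtask": "Divergent Thinking"},  # failed evaluation, ignored
]
total = livebench_aggregate_results(fake_results)  # prints the per-subtask table, returns 65.0
```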
6 changes: 3 additions & 3 deletions tools/live_bench/create_dataset.py
@@ -3,8 +3,8 @@

if __name__ == "__main__":
    website = load_websites()
-    dataset = LiveBench(name="2024-09", force_clear=True)
+    dataset = LiveBench(name="2024-09")

-    website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images")[:2]
-    dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
+    website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images/selected")
+    dataset.capture(websites=website, screen_shoter="human", qa_generator="claude", scorer="claude", checker="gpt4v", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
    dataset.upload()
4 changes: 2 additions & 2 deletions tools/live_bench/example.ipynb
@@ -40,7 +40,7 @@
}
],
"source": [
"from live_bench.data_generator.utils.extract_infomation import InfomationExtractor\n",
"from live_bench.data_generator.utils.extract_information import InfomationExtractor\n",
"from live_bench.screen_shoter import get_shoter\n",
"from live_bench.driver import load_driver\n",
"\n",
@@ -71,7 +71,7 @@
"metadata": {},
"outputs": [],
"source": [
"response = extractor.extract_infomation(w)"
"response = extractor.extract_information(w)"
]
},
{
1 change: 1 addition & 0 deletions tools/live_bench/live_bench/data_generator/check_prompt.md
@@ -8,6 +8,7 @@ Note that the subtask must be one of these four:
- Analytical Questions
- Evaluative Questions
- Divergent Thinking
+- Real-world Assistance

If you think the question does not correspond to the subtask, you have two options:
1. Modify the question to correspond to the subtask.
@@ -26,15 +26,17 @@
        }
    ],
    "Divergent Thinking": [
        {
            "Question": "Considering the current global attention on AI, how might the article about Apple bringing ChatGPT to iPhones in an AI overhaul reflect on broader technological trends and consumer expectations?",
            "Answer": "This article reflects broader trends in AI integration into consumer technology, highlighting competitive dynamics in the tech industry, and growing consumer expectations for sophisticated AI features in everyday devices.",
            "Criteria": "Award up to 10 marks based on the depth and accuracy of the response: 3 marks for identifying AI integration into consumer technology, 3 marks for discussing competitive dynamics in the tech industry, 4 marks for explaining the growth in consumer expectations for sophisticated AI features in everyday devices."
        },
        {
            "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?",
            "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.",
            "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations."
        }
    ],
    "Real-world Assistance": [
        {
            "Question": "Please present this news in Arabic and output it in markdown format.",
            "Answer": "Here is a translation of the key headlines from the provided news page into Arabic in markdown format:\n\n```markdown\n## \u0645\u062c\u0644\u0633 \u0627\u0644\u0623\u0645\u0646 \u064a\u062f\u0639\u0645 \u062e\u0637\u0629 \u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0628\u064a\u0646 \u0625\u0633\u0631\u0627\u0626\u064a\u0644 \u0648\u063a\u0632\u0629 \u0628\u0648\u0633\u0627\u0637\u0629 \u0627\u0644\u0648\u0644\u0627\u064a\u0627\u062a \u0627\u0644\u0645\u062a\u062d\u062f\u0629\n\n- \u0627\u0642\u062a\u0631\u062d \u0645\u062c\u0644\u0633 \u0627\u0644\u0623\u0645\u0646 \u0627\u0644\u062a\u0627\u0628\u0639 \u0644\u0644\u0623\u0645\u0645 \u0627\u0644\u0645\u062a\u062d\u062f\u0629 \u0634\u0631\u0648\u0637\u0627\u064b \u0644\u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0627\u0644\u0643\u0627\u0645\u0644 \u0648\u0625\u0637\u0644\u0627\u0642 \u0633\u0631\u0627\u062d \u0627\u0644\u0631\u0647\u0627\u0626\u0646 \u0627\u0644\u0630\u064a\u0646 \u062a\u062d\u062a\u062c\u0632\u0647\u0645 \u062d\u0645\u0627\u0633.\n\n---\n\n### \u0646\u062a\u0646\u064a\u0627\u0647\u0648 \u064a\u0633\u064a\u0631 \u0639\u0644\u0649 \u062d\u0628\u0644 \u0645\u0634\u062f\u0648\u062f \u0645\u0639 \u062f\u0639\u0648\u0629 \u0627\u0644\u0648\u0644\u0627\u064a\u0627\u062a \u0627\u0644\u0645\u062a\u062d\u062f\u0629 \u0644\u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0641\u064a \u063a\u0632\u0629\n\n- \u064a\u0642\u0648\u0644 \u062c\u064a\u0631\u064a\u0645\u064a \u0628\u0648\u064a\u0646\u060c \u0645\u0631\u0627\u0633\u0644 \u0628\u064a \u0628\u064a \u0633\u064a \u0644\u0634\u0624\u0648\u0646 \u0627\u0644\u0634\u0631\u0642 \u0627\u0644\u0623\u0648\u0633\u0637\u060c \u0625\u0646 \u0645\u0647\u0645\u0629 \u0648\u0632\u064a\u0631 \u0627\u0644\u062e\u0627\u0631\u062c\u064a\u0629 \u0627\u0644\u0623\u0645\u0631\u064a\u0643\u064a \u0628\u0644\u064a\u0646\u0643\u0646 \u0641\u064a \u0627\u0644\u0634\u0631\u0642 \u0627\u0644\u0623\u0648\u0633\u0637 \u062a\u0635\u0637\u062f\u0645 \u0628\u0627\u0644\u0633\u064a\u0627\u0633\u0629 \u0627\u0644\u0625\u0633\u0631\u0627\u0626\u064a\u0644\u064a\u0629.\n\n---\n\n### \u0637\u0639\u0646 \u0623\u0631\u0628\u0639\u0629 \u0645\u062f\u0631\u0651\u0633\u064a\u0646 \u0623\u0645\u0631\u064a\u0643\u064a\u064a\u0646 \u0641\u064a \u062d\u062f\u064a\u0642\u0629 \u0639\u0627\u0645\u0629 \u0641\u064a \u0627\u0644\u0635\u064a\u0646\n\n- \u0642\u0627\u0644\u062a \u0643\u0644\u064a\u0629 \u0643\u0648\u0631\u0646\u064a\u0644 \u0625\u0646 \u0627\u0644\u0645\u062f\u0631\u0651\u0633\u064a\u0646 \u0643\u0627\u0646\u0648\u0627 \u0641\u064a \u0632\u064a\u0627\u0631\u0629 \u0646\u0647\u0627\u0631\u064a\u0629 \u0625\u0644\u0649 \u062d\u062f\u064a\u0642\u0629 \u0639\u0627\u0645\u0629 \u0639\u0646\u062f\u0645\u0627 \u062a\u0639\u0631\u0636\u0648\u0627 \u0644\u0644\u0647\u062c\u0648\u0645.\n\n---\n\n### \u0634\u0631\u0643\u0629 \u0623\u0628\u0644 \u062a\u062c\u0644\u0628 ChatGPT \u0625\u0644\u0649 \u0623\u062c\u0647\u0632\u0629 iPhone \u0641\u064a \u062a\u062d\u062f\u064a\u062b \u0634\u0627\u0645\u0644\n\n- \u0643\u0627\u0646\u062a \u0627\u0644\u0634\u0631\u0643\u0629 \u0623\u0628\u0637\u0623 \u0641\u064a \u0637\u0631\u062d \u0645\u064a\u0632\u0627\u062a \u0627\u0644\u0630\u0643\u0627\u0621 \u0627\u0644\u0627\u0635\u0637\u0646\u0627\u0639\u064a \u0627\u0644\u062a\u0648\u0644\u064a\u062f\u064a \u0645\u0642\u0627\u0631\u0646\u0629 \u0628\u0645\u0646\u0627\u0641\u0633\u064a\u0647\u0627 \u0645\u062b\u0644 \u062c\u0648\u062c\u0644 \u0648\u0645\u0627\u064a\u0643\u0631\u0648\u0633\u0648\u0641\u062a.\n\n---\n\n### \u0637\u0627\u0626\u0631\u0629 \u062a\u0642\u0644 \u0646\u0627\u0626\u0628 \u0631\u0626\u064a\u0633 \u0645\u0644\u0627\u0648\u064a \u062a\u062e\u062a\u0641\u064a\n\n- \u0643\u0627\u0646 \u0633\u0627\u0648\u0644\u0648\u0633 \u062a\u0634\u064a\u0644\u0645\u0627 \u0648\u062a\u0633\u0639\u0629 \u0622\u062e\u0631\u0648\u0646 \u0639\u0644\u0649 \u0645\u062a\u0646 \u0637\u0627\u0626\u0631\u0629 \u0639\u0633\u0643\u0631\u064a\u0629 \u0627\u062e\u062a\u0641\u062a \u0639\u0646 \u0627\u0644\u0631\u0627\u062f\u0627\u0631 \u0635\u0628\u0627\u062d \u0627\u0644\u0627\u062b\u0646\u064a\u0646.\n\n---\n\n### \u062d\u0631\u064a\u0642 \u0641\u064a \u0633\u0648\u0642 \u062a\u0634\u0627\u062a\u0648\u0634\u0627\u0643 \u0627\u0644\u0634\u0647\u064a\u0631 \u0641\u064a \u0628\u0627\u0646\u0643\u0648\u0643 \u064a\u0642\u062a\u0644 1,000 \u062d\u064a\u0648\u0627\u0646\n\n- \u0627\u0644\u062d\u0631\u064a\u0642 \u0641\u064a \u0633\u0648\u0642 \u062a\u0634\u0627\u062a\u0648\u0634\u0627\u0643 \u0627\u0644\u0634\u0647\u064a\u0631 \u0641\u064a \u062a\u0627\u064a\u0644\u0627\u0646\u062f \u0623\u0633\u0641\u0631 \u0639\u0646 \u0645\u0642\u062a\u0644 \u0627\u0644\u0643\u0644\u0627\u0628 \u0648\u0627\u0644\u0637\u064a\u0648\u0631 \u0648\u0627\u0644\u062b\u0639\u0627\u0628\u064a\u0646 \u0648\u0627\u0644\u0642\u0637\u0637.\n\n---\n\n### \u0642\u0646\u0627\u0629 \u0634\u062d\u0646 \u0628\u0627\u0644\u062a\u064a\u0645\u0648\u0631 \u062a\u0639\u064a\u062f \u0641\u062a\u062d\u0647\u0627 \u0628\u0639\u062f \u0627\u0646\u0647\u064a\u0627\u0631 \u062c\u0633\u0631\n\n- \u0642\u0627\u0644\u062a \u0642\u0648\u0627\u062a \u0627\u0644\u0645\u0647\u0646\u062f\u0633\u064a\u0646 \u0627\u0644\u062a\u0627\u0628\u0639\u0629 \u0644\u0644\u062c\u064a\u0634 \u0627\u0644\u0623\u0645\u0631\u064a\u0643\u064a \u0625\u0646 \u0627\u0644\u0645\u0646\u0637\u0642\u0629 \u0623\u0635\u0628\u062d\u062a \"\u0622\u0645\u0646\u0629 \u0644\u0644\u0645\u0644\u0627\u062d\u0629\" \u0628\u0639\u062f \u062d\u0648\u0627\u0644\u064a \u062b\u0644\u0627\u062b\u0629 \u0623\u0634\u0647\u0631 \u0645\u0646 \u0627\u0644\u0643\u0627\u0631\u062b\u0629.\n``` \n\nThis Arabic summary captures the main stories shown on the BBC news homepage in your image.",
            "Criteria": "There are seven titles in total, with a total of 10 points. 1 point will be deducted for a title translation error. 3 points will be deducted for markdown format errors."
        }
    ]
}