Refactor live_bench_2409.yaml and live_bench.yaml
pufanyi committed Oct 7, 2024
1 parent 9a20b96 commit 052517c
Showing 20 changed files with 1,059 additions and 78 deletions.
2 changes: 1 addition & 1 deletion lmms_eval/tasks/live_bench/live_bench.yaml
@@ -4,5 +4,5 @@ task:
  - live_bench_2407

metadata:
-  api_type : openai
+  api_type: azure
  eval_with_mini: false
3 changes: 3 additions & 0 deletions lmms_eval/tasks/live_bench/live_bench_2409.yaml
@@ -0,0 +1,3 @@
task: "live_bench_2409"
dataset_name: 2024-09
include: live_bench_template_yaml_v2
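The new task file stays at three lines because everything shared lives in the included template; only `dataset_name` changes per release. Below is a minimal sketch of how such an `include:` directive could be expanded — lmms-eval's actual loader may differ, `load_task_config` is an illustrative name, and the `!function` lines are filtered out first because plain `yaml.safe_load` cannot parse them (the same trick utils_v2.py uses further down):

```python
import yaml
from pathlib import Path


def load_task_config(path: Path) -> dict:
    """Illustrative include-expansion: template keys first, task keys override."""
    # drop "!function ..." lines, which yaml.safe_load cannot handle
    text = "".join(line for line in path.read_text().splitlines(keepends=True) if "!function" not in line)
    cfg = yaml.safe_load(text)
    if "include" in cfg:
        base = load_task_config(path.parent / cfg.pop("include"))
        base.update(cfg)  # e.g. dataset_name: 2024-09 overrides the template's 2024-07
        cfg = base
    return cfg
```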
29 changes: 29 additions & 0 deletions lmms_eval/tasks/live_bench/live_bench_template_yaml_v2
@@ -0,0 +1,29 @@
dataset_path: lmms-lab/LiveBench
dataset_kwargs:
  token: True
test_split: test
dataset_name: 2024-07
output_type: generate_until
doc_to_visual: !function utils_v2.livebench_doc_to_visual
doc_to_text: !function utils_v2.livebench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 1024
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
process_results: !function utils_v2.livebench_process_results
process_results_use_image: true
metric_list:
  - metric: gpt4_eval_score
    aggregation: !function utils_v2.livebench_aggregate_results
    higher_is_better: true
  # - metric: gpt4_eval_score_mini
  #   aggregation: !function utils.livebench_aggregate_results
  #   higher_is_better: true

lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
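The `!function utils_v2.…` values bind config keys to Python callables in the task's utils module. A hedged sketch of how such a tag could be resolved to a callable — `resolve_function_tag` is an illustrative name, not lmms-eval's actual API:

```python
import importlib


def resolve_function_tag(tag_value: str):
    """Map a tag like 'utils_v2.livebench_doc_to_text' to the callable it names."""
    module_name, _, attr = tag_value.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)
```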
216 changes: 216 additions & 0 deletions lmms_eval/tasks/live_bench/utils_v2.py
@@ -0,0 +1,216 @@
import base64
import json
import logging
import os
import time
from io import BytesIO
from pathlib import Path

import numpy as np
import openai
import pandas as pd
import requests
import yaml
from tqdm import tqdm

eval_logger = logging.getLogger("lmms-eval")


with open(Path(__file__).parent / "live_bench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

config = yaml.safe_load("".join(safe_data))

API_TYPE = config["metadata"]["api_type"]
EVAL_WITH_MINI = config["metadata"]["eval_with_mini"]


def get_openai_client(api_version="2024-02-15-preview") -> openai.OpenAI:
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    if endpoint:
        key = os.getenv("AZURE_OPENAI_API_KEY")
        if not key:
            raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
        return openai.AzureOpenAI(azure_endpoint=endpoint, api_key=key, api_version=api_version)
    else:
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set.")
        return openai.OpenAI(api_key=api_key)


client = get_openai_client()

_PROMPT_WITH_IMAGE = """\
[Question]
{prompt}
[Assistant Response]
{generation}
[Ground Truth Response]
{reference}
[System]
Rate whether the assistant response correctly matches the ground truth, in regards to the image above.
The rating should be 0-10, where 0 is incorrect and 10 is correct.
Below is the specific criteria for rating:
{criteria}
Total score is out of 10. If the model's answer cannot be provided due to political reasons, please assign a score of 0 for further processing. If the model's response is biased due to political factors, please score it based on its understanding of the image, but reduce the objectivity score accordingly.
Your response should be in the JSON format:
```json
{{
"Explanation": "(your explanation)",
"Rating": "(int)"
}}
```
"""


def format_prompt(question, ground_truth_answer, answer, criteria):
    return _PROMPT_WITH_IMAGE.format(prompt=question, generation=answer, reference=ground_truth_answer, criteria=criteria)
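Note the slot renaming: `question` fills `{prompt}`, the model's output fills `{generation}`, and the ground truth fills `{reference}`. A toy call (values invented, not from the dataset) makes the mapping concrete:

```python
example_judge_prompt = format_prompt(
    question="What headline is shown on the page?",
    ground_truth_answer="Apple brings ChatGPT to iPhones in AI overhaul",
    answer="The page announces ChatGPT coming to the iPhone.",
    criteria="Award up to 10 marks for correctly identifying the headline.",
)
```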


def get_chat_response(gpt_model_name, base64_images, question, ground_truth_answer, answer, criteria, max_retries=5, wait_time=10):
    content = []
    for base64_image in base64_images:
        # images are PNG-encoded by image_to_base64 below, so declare image/png
        content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}})
    prompt = format_prompt(question, ground_truth_answer, answer, criteria)
    content.append(
        {
            "type": "text",
            "text": prompt,
        }
    )

    messages = [
        {
            "role": "user",
            "content": content,
        }
    ]

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(model=gpt_model_name, messages=messages, max_tokens=1024, response_format={"type": "json_object"}, temperature=0.0)
            response_data = json.loads(response.choices[0].message.content)
            # coerce to int so the downstream "rating >= 0" comparison is safe
            rating = int(response_data["Rating"])
            explanation = response_data["Explanation"]
            return rating, explanation, gpt_model_name
        except (openai.APIError, requests.exceptions.RequestException) as e:
            # the v1 openai client raises openai.APIError subclasses, so retry those too
            eval_logger.warning(f"Request failed on attempt {attempt + 1}: {e}")
            if attempt == max_retries - 1:
                eval_logger.error(f"Failed to get response after {max_retries} attempts")
                return -1, str(e), gpt_model_name
            time.sleep(wait_time)
        except Exception as e:
            eval_logger.error(f"Error on attempt {attempt + 1}: {e}")
            return -1, str(e), gpt_model_name


def image_to_base64(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
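A quick sanity check for the encoder (a sketch assuming Pillow is installed; not part of the committed file): the payload should decode back to PNG bytes, matching the `data:image/png` URLs built in get_chat_response above.

```python
import base64

from PIL import Image

img = Image.new("RGB", (4, 4), "red")
payload = image_to_base64(img)  # image_to_base64 as defined above
assert base64.b64decode(payload).startswith(b"\x89PNG\r\n\x1a\n")  # PNG magic bytes
```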


_images = {}

dataset = None


def livebench_doc_to_visual(doc):
    img_list = [image.convert("RGB") for image in doc["images"]]
    return img_list


def livebench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['question']}{post_prompt}"


SUBTASKS = ["Basic Understanding", "Analytical Questions", "Divergent Thinking", "Real-world Assistance"]


def livebench_process_results_for_name(doc, results, model, eval_name):
    base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)]
    subtask = doc["subtask"]
    criteria = doc["criteria"]
    if subtask not in SUBTASKS:
        subtask = "further insights"
    if not results or results[0] == "":
        return {eval_name: {"rating": 0, "explanation": "No response", "model_name": "N/A", "subtask": subtask}}
    rating, explanation, model_name = get_chat_response(
        gpt_model_name=model,
        base64_images=base64_images,
        question=doc["question"],
        ground_truth_answer=doc["answer"],
        answer=results[0],
        criteria=criteria,
    )
    if rating >= 0:
        return {eval_name: {"rating": rating, "explanation": explanation, "model_name": model_name, "subtask": subtask, "id": doc["id"]}}
    else:
        return {eval_name: {"rating": -1, "explanation": explanation, "model_name": "N/A", "subtask": subtask, "id": doc["id"]}}


def livebench_process_results_4o(doc, results):
    return livebench_process_results_for_name(doc, results, "gpt-4o", "gpt4_eval_score")


def livebench_process_results_4o_mini(doc, results):
    return livebench_process_results_for_name(doc, results, "gpt-4o-mini", "gpt4_eval_score_mini")


def livebench_process_results(doc, results):
    res = livebench_process_results_4o(doc, results)
    if EVAL_WITH_MINI:
        res.update(livebench_process_results_4o_mini(doc, results))
    return res
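For reference, the per-document mapping this returns has the following shape (values fabricated; a `gpt4_eval_score_mini` entry is added alongside it only when `eval_with_mini` is true in live_bench.yaml):

```python
example_per_doc_result = {
    "gpt4_eval_score": {
        "rating": 8,
        "explanation": "Matches the ground truth on all key points.",
        "model_name": "gpt-4o",
        "subtask": "Basic Understanding",
        "id": 42,
    },
}
```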


def livebench_aggregate_results(results):
    sum_score, count = 0, 0
    score = {subtask: [] for subtask in SUBTASKS}
    for result in results:
        if result["rating"] == -1:
            continue
        sum_score += result["rating"] / 10
        count += 1
        subtask = result["subtask"]
        if subtask not in SUBTASKS:
            subtask = "OTHER_SUBTASK"
        # append under the normalized name so an unknown subtask cannot raise a KeyError
        score.setdefault(subtask, []).append(result["rating"] / 10)
    res = [(subtask, len(score[subtask]), np.mean(score[subtask]) * 100 if score[subtask] else 0.0) for subtask in SUBTASKS]
    res.append(("Total", count, sum_score / count * 100 if count > 0 else 0.0))
    res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"])
    print("=" * 50)
    print(res)
    print("=" * 50)
    if count == 0:
        eval_logger.warning("No valid scores to aggregate")
    return sum_score / count * 100 if count > 0 else None
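A minimal smoke test for the aggregator, using fabricated ratings rather than real benchmark data: two valid scores of 8/10 and 5/10 average to 0.65, so the returned total is 65.0, and the -1 entry is skipped.

```python
fake_results = [
    {"rating": 8, "subtask": "Basic Understanding"},
    {"rating": 5, "subtask": "Analytical Questions"},
    {"rating": -1, "subtask": "Divergent Thinking"},  # failed evaluation, ignored
]
total = livebench_aggregate_results(fake_results)  # prints the per-subtask table, returns 65.0
```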
6 changes: 3 additions & 3 deletions tools/live_bench/create_dataset.py
@@ -3,8 +3,8 @@

if __name__ == "__main__":
    website = load_websites()
-    dataset = LiveBench(name="2024-09", force_clear=True)
+    dataset = LiveBench(name="2024-09")

-    website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images")[:2]
-    dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
+    website = load_websites_from_file("/data/pufanyi/project/lmms-eval/tools/temp/processed_images/selected")
+    dataset.capture(websites=website, screen_shoter="human", qa_generator="claude", scorer="claude", checker="gpt4v", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
    dataset.upload()
4 changes: 2 additions & 2 deletions tools/live_bench/example.ipynb
@@ -40,7 +40,7 @@
}
],
"source": [
"from live_bench.data_generator.utils.extract_infomation import InfomationExtractor\n",
"from live_bench.data_generator.utils.extract_information import InfomationExtractor\n",
"from live_bench.screen_shoter import get_shoter\n",
"from live_bench.driver import load_driver\n",
"\n",
@@ -71,7 +71,7 @@
"metadata": {},
"outputs": [],
"source": [
"response = extractor.extract_infomation(w)"
"response = extractor.extract_information(w)"
]
},
{
1 change: 1 addition & 0 deletions tools/live_bench/live_bench/data_generator/check_prompt.md
@@ -8,6 +8,7 @@ Note that the subtask must be one of these four:
- Analytical Questions
- Evaluative Questions
- Divergent Thinking
+- Real-world Assistance

If you think the question does not correspond to the subtask, you have two options:
1. Modify the question to correspond to the subtask.
@@ -26,15 +26,17 @@
        }
    ],
    "Divergent Thinking": [
        {
            "Question": "Considering the current global attention on AI, how might the article about Apple bringing ChatGPT to iPhones in an AI overhaul reflect on broader technological trends and consumer expectations?",
            "Answer": "This article reflects broader trends in AI integration into consumer technology, highlighting competitive dynamics in the tech industry, and growing consumer expectations for sophisticated AI features in everyday devices.",
            "Criteria": "Award up to 10 marks based on the depth and accuracy of the response: 3 marks for identifying AI integration into consumer technology, 3 marks for discussing competitive dynamics in the tech industry, 4 marks for explaining the growth in consumer expectations for sophisticated AI features in everyday devices."
        },
        {
            "Question": "How does the image of a child in distress in a green field relate symbolically to the outcomes or themes of conflict depicted in the ceasefire article?",
            "Answer": "The image symbolically represents the innocent casualties of conflict, particularly how children are affected, resonating with the urgency and necessity of a ceasefire to protect the most vulnerable populations from the consequences of prolonged conflict.",
            "Criteria": "Award up to 10 marks based on the accuracy and relevance of the response: 2 marks for identifying the symbolic representation of innocent casualties, 3 marks for specifically mentioning how children are affected, 3 marks for relating this to the urgency and necessity of a ceasefire, 2 marks for connecting these elements to the protection of vulnerable populations."
        }
    ],
    "Real-world Assistance": [
        {
            "Question": "Please present this news in Arabic and output it in markdown format.",
            "Answer": "Here is a translation of the key headlines from the provided news page into Arabic in markdown format:\n\n```markdown\n## \u0645\u062c\u0644\u0633 \u0627\u0644\u0623\u0645\u0646 \u064a\u062f\u0639\u0645 \u062e\u0637\u0629 \u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0628\u064a\u0646 \u0625\u0633\u0631\u0627\u0626\u064a\u0644 \u0648\u063a\u0632\u0629 \u0628\u0648\u0633\u0627\u0637\u0629 \u0627\u0644\u0648\u0644\u0627\u064a\u0627\u062a \u0627\u0644\u0645\u062a\u062d\u062f\u0629\n\n- \u0627\u0642\u062a\u0631\u062d \u0645\u062c\u0644\u0633 \u0627\u0644\u0623\u0645\u0646 \u0627\u0644\u062a\u0627\u0628\u0639 \u0644\u0644\u0623\u0645\u0645 \u0627\u0644\u0645\u062a\u062d\u062f\u0629 \u0634\u0631\u0648\u0637\u0627\u064b \u0644\u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0627\u0644\u0643\u0627\u0645\u0644 \u0648\u0625\u0637\u0644\u0627\u0642 \u0633\u0631\u0627\u062d \u0627\u0644\u0631\u0647\u0627\u0626\u0646 \u0627\u0644\u0630\u064a\u0646 \u062a\u062d\u062a\u062c\u0632\u0647\u0645 \u062d\u0645\u0627\u0633.\n\n---\n\n### \u0646\u062a\u0646\u064a\u0627\u0647\u0648 \u064a\u0633\u064a\u0631 \u0639\u0644\u0649 \u062d\u0628\u0644 \u0645\u0634\u062f\u0648\u062f \u0645\u0639 \u062f\u0639\u0648\u0629 \u0627\u0644\u0648\u0644\u0627\u064a\u0627\u062a \u0627\u0644\u0645\u062a\u062d\u062f\u0629 \u0644\u0648\u0642\u0641 \u0625\u0637\u0644\u0627\u0642 \u0627\u0644\u0646\u0627\u0631 \u0641\u064a \u063a\u0632\u0629\n\n- \u064a\u0642\u0648\u0644 \u062c\u064a\u0631\u064a\u0645\u064a \u0628\u0648\u064a\u0646\u060c \u0645\u0631\u0627\u0633\u0644 \u0628\u064a \u0628\u064a \u0633\u064a \u0644\u0634\u0624\u0648\u0646 \u0627\u0644\u0634\u0631\u0642 \u0627\u0644\u0623\u0648\u0633\u0637\u060c \u0625\u0646 \u0645\u0647\u0645\u0629 \u0648\u0632\u064a\u0631 \u0627\u0644\u062e\u0627\u0631\u062c\u064a\u0629 \u0627\u0644\u0623\u0645\u0631\u064a\u0643\u064a \u0628\u0644\u064a\u0646\u0643\u0646 \u0641\u064a \u0627\u0644\u0634\u0631\u0642 \u0627\u0644\u0623\u0648\u0633\u0637 \u062a\u0635\u0637\u062f\u0645 \u0628\u0627\u0644\u0633\u064a\u0627\u0633\u0629 \u0627\u0644\u0625\u0633\u0631\u0627\u0626\u064a\u0644\u064a\u0629.\n\n---\n\n### \u0637\u0639\u0646 \u0623\u0631\u0628\u0639\u0629 \u0645\u062f\u0631\u0651\u0633\u064a\u0646 \u0623\u0645\u0631\u064a\u0643\u064a\u064a\u0646 \u0641\u064a \u062d\u062f\u064a\u0642\u0629 \u0639\u0627\u0645\u0629 \u0641\u064a \u0627\u0644\u0635\u064a\u0646\n\n- \u0642\u0627\u0644\u062a \u0643\u0644\u064a\u0629 \u0643\u0648\u0631\u0646\u064a\u0644 \u0625\u0646 \u0627\u0644\u0645\u062f\u0631\u0651\u0633\u064a\u0646 \u0643\u0627\u0646\u0648\u0627 \u0641\u064a \u0632\u064a\u0627\u0631\u0629 \u0646\u0647\u0627\u0631\u064a\u0629 \u0625\u0644\u0649 \u062d\u062f\u064a\u0642\u0629 \u0639\u0627\u0645\u0629 \u0639\u0646\u062f\u0645\u0627 \u062a\u0639\u0631\u0636\u0648\u0627 \u0644\u0644\u0647\u062c\u0648\u0645.\n\n---\n\n### \u0634\u0631\u0643\u0629 \u0623\u0628\u0644 \u062a\u062c\u0644\u0628 ChatGPT \u0625\u0644\u0649 \u0623\u062c\u0647\u0632\u0629 iPhone \u0641\u064a \u062a\u062d\u062f\u064a\u062b \u0634\u0627\u0645\u0644\n\n- \u0643\u0627\u0646\u062a \u0627\u0644\u0634\u0631\u0643\u0629 \u0623\u0628\u0637\u0623 \u0641\u064a \u0637\u0631\u062d \u0645\u064a\u0632\u0627\u062a \u0627\u0644\u0630\u0643\u0627\u0621 \u0627\u0644\u0627\u0635\u0637\u0646\u0627\u0639\u064a \u0627\u0644\u062a\u0648\u0644\u064a\u062f\u064a \u0645\u0642\u0627\u0631\u0646\u0629 \u0628\u0645\u0646\u0627\u0641\u0633\u064a\u0647\u0627 \u0645\u062b\u0644 \u062c\u0648\u062c\u0644 \u0648\u0645\u0627\u064a\u0643\u0631\u0648\u0633\u0648\u0641\u062a.\n\n---\n\n### \u0637\u0627\u0626\u0631\u0629 \u062a\u0642\u0644 \u0646\u0627\u0626\u0628 \u0631\u0626\u064a\u0633 \u0645\u0644\u0627\u0648\u064a \u062a\u062e\u062a\u0641\u064a\n\n- \u0643\u0627\u0646 \u0633\u0627\u0648\u0644\u0648\u0633 \u062a\u0634\u064a\u0644\u0645\u0627 \u0648\u062a\u0633\u0639\u0629 \u0622\u062e\u0631\u0648\u0646 \u0639\u0644\u0649 \u0645\u062a\u0646 \u0637\u0627\u0626\u0631\u0629 \u0639\u0633\u0643\u0631\u064a\u0629 \u0627\u062e\u062a\u0641\u062a \u0639\u0646 \u0627\u0644\u0631\u0627\u062f\u0627\u0631 \u0635\u0628\u0627\u062d \u0627\u0644\u0627\u062b\u0646\u064a\u0646.\n\n---\n\n### \u062d\u0631\u064a\u0642 \u0641\u064a \u0633\u0648\u0642 \u062a\u0634\u0627\u062a\u0648\u0634\u0627\u0643 \u0627\u0644\u0634\u0647\u064a\u0631 \u0641\u064a \u0628\u0627\u0646\u0643\u0648\u0643 \u064a\u0642\u062a\u0644 1,000 \u062d\u064a\u0648\u0627\u0646\n\n- \u0627\u0644\u062d\u0631\u064a\u0642 \u0641\u064a \u0633\u0648\u0642 \u062a\u0634\u0627\u062a\u0648\u0634\u0627\u0643 \u0627\u0644\u0634\u0647\u064a\u0631 \u0641\u064a \u062a\u0627\u064a\u0644\u0627\u0646\u062f \u0623\u0633\u0641\u0631 \u0639\u0646 \u0645\u0642\u062a\u0644 \u0627\u0644\u0643\u0644\u0627\u0628 \u0648\u0627\u0644\u0637\u064a\u0648\u0631 \u0648\u0627\u0644\u062b\u0639\u0627\u0628\u064a\u0646 \u0648\u0627\u0644\u0642\u0637\u0637.\n\n---\n\n### \u0642\u0646\u0627\u0629 \u0634\u062d\u0646 \u0628\u0627\u0644\u062a\u064a\u0645\u0648\u0631 \u062a\u0639\u064a\u062f \u0641\u062a\u062d\u0647\u0627 \u0628\u0639\u062f \u0627\u0646\u0647\u064a\u0627\u0631 \u062c\u0633\u0631\n\n- \u0642\u0627\u0644\u062a \u0642\u0648\u0627\u062a \u0627\u0644\u0645\u0647\u0646\u062f\u0633\u064a\u0646 \u0627\u0644\u062a\u0627\u0628\u0639\u0629 \u0644\u0644\u062c\u064a\u0634 \u0627\u0644\u0623\u0645\u0631\u064a\u0643\u064a \u0625\u0646 \u0627\u0644\u0645\u0646\u0637\u0642\u0629 \u0623\u0635\u0628\u062d\u062a \"\u0622\u0645\u0646\u0629 \u0644\u0644\u0645\u0644\u0627\u062d\u0629\" \u0628\u0639\u062f \u062d\u0648\u0627\u0644\u064a \u062b\u0644\u0627\u062b\u0629 \u0623\u0634\u0647\u0631 \u0645\u0646 \u0627\u0644\u0643\u0627\u0631\u062b\u0629.\n``` \n\nThis Arabic summary captures the main stories shown on the BBC news homepage in your image.",
            "Criteria": "There are seven titles in total, with a total of 10 points. 1 point will be deducted for a title translation error. 3 points will be deducted for markdown format errors."
        }
    ]
}