[Fix] LiveBench 2409 (#308)
* Refactor code formatting in setup.py, .gitignore, __init__.py, and data_summary.ipynb

* Refactor DefaultWebsite class in website.py

* Refactor dataset name and limit processed images to 2

* Refactor extract_infomation.py and update prompts

* update

* Refactor live_bench_2409.yaml and live_bench.yaml

* Refactor live_bench/utils_v2.py and question_finalizer.py

* Squashed commit of the following:

commit 581a672
Author: Yinan He <[email protected]>
Date:   Wed Oct 9 06:11:24 2024 +0200

    [Fix] Fix cache_dir issue where MVBench cannot be found (#306)

    * [add] add internvideo2 support && change mvbench to video branch

    * [add] answer_prompt of internvideo2

    * [add] change video type of internvideo2

    * [fix] update template of mvbench

    * [reformat]

    * [fix] generate_until_multi_round

    * [Feat] videochat2 support

    * [feat] Link cache_path to cache_dir if no unzip or untar

    * [feat] new variable in dataset kwargs: create_link

    ---------

    Co-authored-by: heyinan <[email protected]>

commit 99fcd39
Author: GIO PAIK <[email protected]>
Date:   Tue Oct 8 15:56:53 2024 +0900

    fix: Invalid group in mmsearch.yaml (#305)

commit 6e1d747
Author: Li Bo <[email protected]>
Date:   Tue Oct 8 08:42:20 2024 +0800

    [Doc] add more detailed task guide to explain the variables in yaml configuration file (#303)

commit b838ff6
Author: Pu Fanyi <[email protected]>
Date:   Tue Oct 8 01:25:35 2024 +0800

    [Feat] LiveBench 2409 (#304)

    * Refactor code formatting in setup.py, .gitignore, __init__.py, and data_summary.ipynb

    * Refactor DefaultWebsite class in website.py

    * Refactor dataset name and limit processed images to 2

    * Refactor extract_infomation.py and update prompts

    * update

    * Refactor live_bench_2409.yaml and live_bench.yaml

commit 09d15c3
Author: Zhijian Liu <[email protected]>
Date:   Sat Oct 5 11:46:44 2024 -0400

    Fix MMMU-Pro evaluation (#296)

* fix
pufanyi authored Oct 9, 2024
1 parent 581a672 commit 39e1586
Showing 3 changed files with 44 additions and 22 deletions.
4 changes: 0 additions & 4 deletions lmms_eval/tasks/live_bench/utils_v2.py
@@ -69,8 +69,6 @@ def get_openai_client(api_version="2024-02-15-preview") -> openai.OpenAI:
{criteria}
Total score is out of 10. If the model's answer cannot be provided due to political reasons, please assign a score of 0 for further processing. If the model's response is biased due to political factors, please score it based on its understanding of the image, but reduce the objectivity score accordingly.
Your response should be in the JSON format:
```json
{{
@@ -164,8 +162,6 @@ def livebench_process_results_for_name(doc, results, model, eval_name):
base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)]
subtask = doc["subtask"]
criteria = doc["criteria"]
if subtask not in SUBTASKS:
subtask = "further insights"
if not results or results[0] == "":
return {eval_name: {"rating": 0, "explanation": "No response", "model_name": "N/A", "subtask": subtask}}
rating, explanation, model_name = get_chat_response(gpt_model_name=model, base64_images=base64_images, question=doc["question"], ground_truth_answer=doc["answer"], answer=results[0] if results else "", criteria=criteria)
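For orientation, the per-sample judging path visible in this hunk can be sketched roughly as below. This is a minimal sketch rather than the file's actual code: the final return shape is assumed, and SUBTASKS, image_to_base64, livebench_doc_to_visual, and get_chat_response are taken as given from the lines shown above.

```python
# Rough sketch of the scoring path shown in the hunk above (not the actual file).
# SUBTASKS, image_to_base64, livebench_doc_to_visual, and get_chat_response are
# assumed to exist in utils_v2.py with the signatures implied by the visible lines.
def livebench_process_results_sketch(doc, results, model, eval_name):
    subtask = doc["subtask"]
    if subtask not in SUBTASKS:
        # Unknown subtasks fall back to a catch-all bucket.
        subtask = "further insights"
    if not results or results[0] == "":
        # An empty model response scores 0 without calling the GPT judge.
        return {eval_name: {"rating": 0, "explanation": "No response", "model_name": "N/A", "subtask": subtask}}
    base64_images = [image_to_base64(image) for image in livebench_doc_to_visual(doc)]
    rating, explanation, model_name = get_chat_response(
        gpt_model_name=model,
        base64_images=base64_images,
        question=doc["question"],
        ground_truth_answer=doc["answer"],
        answer=results[0],
        criteria=doc["criteria"],
    )
    # Assumed result shape, mirroring the empty-response branch above.
    return {eval_name: {"rating": rating, "explanation": explanation, "model_name": model_name, "subtask": subtask}}
```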
53 changes: 38 additions & 15 deletions tools/live_bench/live_bench/data_generator/question_finalizer.py
@@ -74,34 +74,56 @@ def get_answer(self, question: str, images: List[Image.Image]):


QUESTION_FINALIZER_PROMPT = """\
You are a question setter, and your task is to modify the following question, answer, and scoring criteria to ensure:
You are a question setter. Your task is to reformat the answer and the scoring criteria so that the scoring criteria are easier for a human to read:
1. The criteria should be written in natural language; do not use dict / json format for the criteria, since humans cannot read it easily.
2. You may use bullet points, numbered lists, or a yaml-like layout for the criteria, but do not use a python-like format.
3. If the answer is in dict format but does not need to be (i.e., it can be expressed in natural language and the question does not require a dict), convert it to natural language.
4. If the criteria are written entirely in another language, translate them to English, but keep individual words in another language if you think they should stay that way. If the question or answer is in another language, you do not need to change it.
Ensure:
1. The scoring criteria are rational and facilitate the accurate assessment of responses.
2. The full score for the scoring criteria must be 10 points, and it must directly relate to the specific answer.
3. The criteria should be in natural language instead of dict format. However, you can still use bullet points or numbers to list the criteria.
4. You do not need to change the question; just reformat the answer and scoring criteria, but you still need to output the final question.
5. Do not change the meaning of the question or the answer; only reformat the answer and the scoring criteria.
"""

"""3. The scoring criteria are rational and facilitate the accurate assessment of responses.
4. The full score for the scoring criteria must be 10 points, and it must directly relate to the specific answer.
5. The criteria should be in natural language instead of dict format. However, you can still use bullet points or numbers to list the criteria.
Your task is to increase the difficulty of earning partial credit and ensure that the question and criteria are closely tied to the understanding of the image. What you need to keep in mind is that the original intent of this question is to assess the understanding of the image. Therefore, you can modify the question and criteria to make the scoring closely aligned with whether the image's content and details are correctly understood. Of course, while modifying the question and criteria, don't forget to adjust the answer to correspond with the revised question and criteria.
Ensure:
1. The question is clear and unambiguous.
2. The answer is correct and reasonable (although the original ground truth answer is mostly correct, it may not be perfect, and sometimes the answer may be incorrect).
3. The scoring criteria are rational and facilitate the accurate assessment of responses.
4. The full score for the scoring criteria must be 10 points, and it must directly relate to the specific answer.
5. Except for some multiple-choice questions or other questions with only one possible answer, the scoring criteria should not be an all-or-nothing system. Partially correct answers should receive proportional points.
6. Ensure that the scoring system is flexible enough to accommodate slight variations in correct answers while still maintaining a standard for what is considered an acceptable answer.
7. Clearly define what constitutes a full score, partial score, and zero score.
8. The criteria should be as detailed as possible, so that even an LLM without image understanding capabilities could score the answer correctly based on the criteria and the ground truth answer.
5. The criteria should be in natural language instead of dict format. However, you can still use bullet points or numbers to list the criteria.
"""

"""
Your task is to finalize these standards, thus ensuring the correctness of the answer and the rationality of the scoring criteria.
Some tips:
1. For some extremely hard open-ended questions where answers may vary, hitting every point perfectly may not be realistic. In such cases, you can relax the criteria slightly; for example, if there are five possible points in an answer, answering three of them adequately could merit full points. Another option is to change the question to a multiple-choice / multi-select question. But remember, this only applies to extremely hard open-ended questions that are impossible to answer perfectly.
2. For some questions, changing the format might be beneficial. You can consider transforming them into different types of questions such as essay, fill-in-the-blank, multiple-choice (or multiple-select), true/false, ranking (e.g., based on time, importance, etc.), or matching questions to enhance the difficulty and rationality of the scoring criteria.
2. For some questions, changing the format might be beneficial. You can consider transforming them into different types of questions such as essay, fill-in-the-blank, ranking (e.g., based on time, importance, etc.), or matching questions to enhance the difficulty and rationality of the scoring criteria. But one very important point: DO NOT CHANGE the question to a multiple-choice question. If the original question is multiple-choice, you need to change it to another type of question (e.g., open-ended, fill-in-the-blank, etc.).
Very important: do not change the question to a multiple-choice question. If the original question is multiple-choice, change it to another type of question (e.g., open-ended, fill-in-the-blank, etc.). If you change the question type, make sure you also adjust the answer and scoring criteria accordingly. For example, if a question reads like a multiple-choice question but has no choices, change it to an open-ended question and adjust the answer and scoring criteria to fit the open-ended format.
"""

FINALIZER_OUTPUT_FORMAT_PROMPT = """\
Please provide the final question, answer, and scoring criteria in the following json format:
{
"question": "The final question",
"answer": "The final answer",
"criteria": "The final scoring criteria"
"question": "<The final question>",
"answer": "<The final answer>",
"criteria": "<The final scoring criteria>"
}
As a reminder, if you want to add a new line inside the json string, you should use the escape sequence "\\n" to represent the new line.
<The final scoring criteria> should be a single string, not a dict / list object.
"""


@@ -114,16 +136,17 @@ def __init__(self, gpt4v_model: str = "gpt-4o", claude_model: str = "claude-3-5-

def finalize_question(self, question, answer, criteria, images: List[Image.Image]):
information = [f"[Original Question]\n{question}", f"[Original Answer]\n{answer}", f"[Original Criteria]\n{criteria}"]
information.append(
"Below are answers from three candidates for reference. These answers may not be correct but are reasonably credible (but mostly correct). If any candidate rejects to answer, consider whether there is an issue with the question (such as containing violent or graphic content, or having a clear political bias). If so, please make necessary modifications. For open-ended questions, also consider the reasonableness of these answers. If they are reasonable, you may need to adjust the scoring criteria or the answer itself."
)
for model_name, model in self.models.items():
information.append(f"[{model_name} Answer]\n{model.get_answer(question, images)}")
# information.append(
# "Below are answers from three candidates for reference. These answers may not be correct but are reasonably credible (but mostly correct). If any candidate rejects to answer, consider whether there is an issue with the question (such as containing violent or graphic content, or having a clear political bias). If so, please make necessary modifications. For open-ended questions, also consider the reasonableness of these answers. If they are reasonable, you may need to adjust the scoring criteria or the answer itself."
# )
# for model_name, model in self.models.items():
# information.append(f"[{model_name} Answer]\n{model.get_answer(question, images)}")
information.append(FINALIZER_OUTPUT_FORMAT_PROMPT)
prompt = "\n\n".join(information)
messages = [{"role": "user", "content": format_gpt4v_images(images) + [{"type": "text", "text": prompt}]}]
try:
response = gpt4v_generate_response(client=self.client, model="gpt-4o", messages=messages, system=QUESTION_FINALIZER_PROMPT)
# response = claude_generate_response(self.client, "claude-3-5-sonnet-20240620", messages)
if response.success:
data = json.loads(response.content)
return {
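The refine_all_results.py diff below shows how this class is driven over the HF dataset; as a hedged standalone sketch (the image path, question, answer, and criteria are invented, and the exact shape of the returned dict is assumed rather than visible in this hunk), usage looks roughly like:

```python
from PIL import Image

from live_bench.data_generator.question_finalizer import QuestionFinalizer

# Hypothetical standalone call; QuestionFinalizer() uses its default judge models
# per the constructor signature shown above.
finalizer = QuestionFinalizer()
result = finalizer.finalize_question(
    question="What is the headline of the article in the screenshot?",
    answer="The headline announces a new climate agreement.",
    criteria="Full credit for quoting the headline accurately.",
    images=[Image.open("screenshot.png")],  # invented path for illustration
)
print(result)  # expected to carry the finalized question, answer, and criteria
```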
9 changes: 6 additions & 3 deletions tools/live_bench/refine_all_results.py
@@ -3,11 +3,14 @@
from tqdm import tqdm

if __name__ == "__main__":
hf_data = load_dataset("lmms-lab/LiveBench", "2024-07", split="test")
hf_data = load_dataset("lmms-lab/LiveBench", "2024-09", split="test")
finalizer = QuestionFinalizer()

def load_results():
for item in tqdm(hf_data):
# if item["subtask"] != "Divergent Thinking":
# yield item
# continue
try:
res = finalizer.finalize_question(question=item["question"], answer=item["answer"], criteria=item["criteria"], images=item["images"])
final_answer = item.copy()
@@ -31,5 +34,5 @@ def load_results():
final_data[item].append(value)
# final_data = Dataset.from_generator(load_results)
final_data = Dataset.from_dict(final_data, features=hf_data.features)
final_data.save_to_disk("logs/2024-07-final")
final_data.push_to_hub("lmms-lab/LiveBench", "2024-07")
final_data.save_to_disk("logs/2024-09-final")
final_data.push_to_hub("lmms-lab/LiveBench", "2024-09", split="test")
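With the config and split names updated above, downstream consumers would load the refreshed release roughly as follows (a minimal sketch; the "2024-09" config and "test" split are taken from the diff, and the datasets library is assumed to be installed):

```python
from datasets import load_dataset

# Load the refined LiveBench release pushed by this script.
live_bench_2409 = load_dataset("lmms-lab/LiveBench", "2024-09", split="test")
print(live_bench_2409)           # Dataset with the refined questions, answers, and criteria
print(live_bench_2409.features)  # schema should match the 2024-09 config on the Hub
```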
