From f20c305dc743eb545a57fd19b3b59426b9171465 Mon Sep 17 00:00:00 2001 From: Erik Ritter Date: Wed, 20 Dec 2023 17:09:36 -0800 Subject: [PATCH] Add MMMU evals and runner (#1442) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Eval details 📑 ### Eval name MMMU ### Eval description A multi-modal version of MMLU published here: https://arxiv.org/pdf/2311.16502.pdf ### What makes this a useful eval? Tests a variety of subjects, along with image recognition and comprehension ## Criteria for a good eval ✅ Below are some of the criteria we look for in a good eval. In general, we are seeking cases where the model does not do a good job despite being capable of generating a good response (note that there are some things large language models cannot do, so those would not make good evals). Your eval should be: - [x] Thematically consistent: The eval should be thematically consistent. We'd like to see a number of prompts all demonstrating some particular failure mode. For example, we can create an eval on cases where the model fails to reason about the physical world. - [x] Contains failures where a human can do the task, but either GPT-4 or GPT-3.5-Turbo could not. - [x] Includes good signal around what is the right behavior. This means either a correct answer for `Basic` evals or the `Fact` Model-graded eval, or an exhaustive rubric for evaluating answers for the `Criteria` Model-graded eval. - [x] **Include at least 15 high-quality examples.** If there is anything else that makes your eval worth including, please document it below. ### Unique eval value Multimodal, covers many subjects ## Eval structure 🏗️ Your eval should - [x] Check that your YAML is registered at `evals/registry/evals/{name}.yaml` - [x] Ensure you have the right to use the data you submit via this eval ### Eval JSON data Dataset defined here: https://huggingface.co/datasets/MMMU/MMMU ### Eval Results on `gpt-4-vision-preview`: ``` { "mmmu-accounting": 0.5333333333333333, "mmmu-agriculture": 0.6333333333333333, "mmmu-architecture-and-engineering": 0.16666666666666666, "mmmu-art": 0.7333333333333333, "mmmu-art-theory": 0.8333333333333334, "mmmu-basic-medical-science": 0.6, "mmmu-biology": 0.43333333333333335, "mmmu-chemistry": 0.43333333333333335, "mmmu-clinical-medicine": 0.6333333333333333, "mmmu-computer-science": 0.6333333333333333, "mmmu-design": 0.7666666666666667, "mmmu-diagnostics-and-laboratory-medicine": 0.3, "mmmu-economics": 0.6333333333333333, "mmmu-electronics": 0.4, "mmmu-energy-and-power": 0.36666666666666664, "mmmu-finance": 0.43333333333333335, "mmmu-geography": 0.4, "mmmu-history": 0.6666666666666666, "mmmu-literature": 0.9, "mmmu-manage": 0.6, "mmmu-marketing": 0.6333333333333333, "mmmu-materials": 0.26666666666666666, "mmmu-math": 0.5, "mmmu-mechanical-engineering": 0.23333333333333334, "mmmu-music": 0.36666666666666664, "mmmu-pharmacy": 0.7666666666666667, "mmmu-physics": 0.43333333333333335, "mmmu-psychology": 0.7, "mmmu-public-health": 0.8, "mmmu-sociology": 0.5666666666666667 } Average accuracy: 0.5455555555555556 ``` Note that this is slightly lower than the MMMU paper's findings of `0.568`. 
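For reference, the reported average is the unweighted mean of the 30 per-subject accuracies above; since each subject's validation split appears to contain 30 questions, this also matches the overall accuracy across all 900 samples. A minimal sketch of the calculation, assuming the JSON block above has been saved to a hypothetical `mmmu_results.json`:

```python
import json

# Hypothetical file holding the per-subject results block pasted above.
with open("mmmu_results.json") as f:
    results = json.load(f)

# "Average accuracy" is the unweighted mean over the 30 subjects.
average = sum(results.values()) / len(results)
print(f"Average accuracy: {average}")  # 0.5455555555555556
```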
There's likely prompt engineering that could be done to improve this, but I'll leave that as an exercise for later --- evals/elsuite/mmmu/eval.py | 174 ++++++++++++ evals/registry/eval_sets/mmmu.yaml | 3 + evals/registry/evals/mmmu.yaml | 419 +++++++++++++++++++++++++++++ 3 files changed, 596 insertions(+) create mode 100644 evals/elsuite/mmmu/eval.py create mode 100644 evals/registry/eval_sets/mmmu.yaml create mode 100644 evals/registry/evals/mmmu.yaml diff --git a/evals/elsuite/mmmu/eval.py b/evals/elsuite/mmmu/eval.py new file mode 100644 index 0000000000..0dd4cc3bfd --- /dev/null +++ b/evals/elsuite/mmmu/eval.py @@ -0,0 +1,174 @@ +import ast +import base64 +import logging +from io import BytesIO +from typing import Optional, Union +from urllib.parse import parse_qs, urlparse + +from datasets import load_dataset +from PIL import Image +from pydantic import BaseModel + +import evals +import evals.metrics +from evals.api import CompletionFn +from evals.formatting import make_abc +from evals.record import RecorderBase, record_match + +logger = logging.getLogger(__name__) + + +class Sample(BaseModel): + question: str + answers: list[str] + label: Union[int, str] + question_type: str + image_1: Optional[Image.Image] + image_2: Optional[Image.Image] + image_3: Optional[Image.Image] + image_4: Optional[Image.Image] + image_5: Optional[Image.Image] + image_6: Optional[Image.Image] + image_7: Optional[Image.Image] + + class Config: + arbitrary_types_allowed = True + + +def get_dataset(url: str) -> list[Sample]: + parsed = urlparse(url) + query = parse_qs(parsed.query) + query = {k: v[0] for k, v in query.items()} + + dataset = load_dataset("mmmu/mmmu", **query) + + return [ + Sample( + question=sample["question"], + answers=ast.literal_eval(sample["options"]), + label=( + ord(sample["answer"]) - ord("A") + if sample["question_type"] == "multiple-choice" + else sample["answer"] + ), + question_type=sample["question_type"], + image_1=sample["image_1"], + image_2=sample["image_2"], + image_3=sample["image_3"], + image_4=sample["image_4"], + image_5=sample["image_5"], + image_6=sample["image_6"], + image_7=sample["image_7"], + ) + for sample in dataset + ] + + +class MMMU(evals.Eval): + def __init__( + self, + completion_fns: list[CompletionFn], + dataset: str, + subject: str, + *args, + **kwargs, + ): + super().__init__(completion_fns, *args, **kwargs) + assert len(completion_fns) == 1, "MMMU only supports one completion fn" + self.dataset = dataset + self.subject = subject + + def eval_sample(self, sample: Sample, rng): + assert isinstance(sample, Sample) + + if sample.question_type == "multiple-choice": + options, correct_answer = make_abc( + answers=sample.answers, + correct_idx=sample.label, + rng=rng, + ) + prompt = sample.question + "\n" + options + system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is exactly one of A,B,C,D: "ANSWER: X"' + else: + correct_answer = sample.label + prompt = sample.question + system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. 
Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' + + images = [ + image + for image in [ + sample.image_1, + sample.image_2, + sample.image_3, + sample.image_4, + sample.image_5, + sample.image_6, + sample.image_7, + ] + if image is not None + ] + + base_64_images = [] + for image in images: + buffer = BytesIO() + image.save(buffer, format="PNG") + img_str = base64.b64encode(buffer.getvalue()) + base_64_images.append(img_str.decode()) + + try: + result = self.completion_fn( + prompt=[ + { + "role": "system", + "content": [ + { + "type": "text", + "text": system_prompt, + }, + ], + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt, + }, + ] + + [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{base_64_image}", + }, + } + for base_64_image in base_64_images + ], + }, + ], + temperature=0.0, + max_tokens=4096, + ) + sampled = result.get_completions()[0] + except Exception as e: + logging.info("Sampling failed!") + logging.info(sample) + logging.info(f"Prompt: {prompt}") + logging.info(f"Error: {str(e)}") + sampled = "ERROR: " + str(e) + + match = sampled.find(f"ANSWER: {correct_answer}") != -1 + + record_match( + match, + expected=correct_answer, + picked=(correct_answer if match else None), + sampled=sampled, + ) + + def run(self, recorder: RecorderBase): + samples = get_dataset(self.dataset) + self.eval_all_samples(recorder, samples) + return { + "accuracy": evals.metrics.get_accuracy(recorder.get_events("match")), + } diff --git a/evals/registry/eval_sets/mmmu.yaml b/evals/registry/eval_sets/mmmu.yaml new file mode 100644 index 0000000000..a50ab164fc --- /dev/null +++ b/evals/registry/eval_sets/mmmu.yaml @@ -0,0 +1,3 @@ +mmmu: + evals: + - mmmu-*.validation.v1 \ No newline at end of file diff --git a/evals/registry/evals/mmmu.yaml b/evals/registry/evals/mmmu.yaml new file mode 100644 index 0000000000..6b382c7af9 --- /dev/null +++ b/evals/registry/evals/mmmu.yaml @@ -0,0 +1,419 @@ +mmmu-accounting: + id: mmmu-accounting.validation.v1 + metrics: [accuracy] +mmmu-accounting.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Accounting&split=dev + subject: Accounting +mmmu-accounting.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Accounting&split=validation + subject: Accounting + +mmmu-agriculture: + id: mmmu-agriculture.validation.v1 + metrics: [accuracy] +mmmu-agriculture.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Agriculture&split=dev + subject: Agriculture +mmmu-agriculture.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Agriculture&split=validation + subject: Agriculture + +mmmu-architecture-and-engineering: + id: mmmu-architecture-and-engineering.validation.v1 + metrics: [accuracy] +mmmu-architecture-and-engineering.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Architecture_and_Engineering&split=dev + subject: Architecture and Engineering +mmmu-architecture-and-engineering.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Architecture_and_Engineering&split=validation + subject: Architecture and Engineering + +mmmu-art: + id: mmmu-art.validation.v1 + metrics: [accuracy] +mmmu-art.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Art&split=dev + subject: Art +mmmu-art.validation.v1: + class: 
evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Art&split=validation + subject: Art + +mmmu-art-theory: + id: mmmu-art-theory.validation.v1 + metrics: [accuracy] +mmmu-art-theory.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Art_Theory&split=dev + subject: Art Theory +mmmu-art-theory.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Art_Theory&split=validation + subject: Art Theory + +mmmu-basic-medical-science: + id: mmmu-basic-medical-science.validation.v1 + metrics: [accuracy] +mmmu-basic-medical-science.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Basic_Medical_Science&split=dev + subject: Basic Medical Science +mmmu-basic-medical-science.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Basic_Medical_Science&split=validation + subject: Basic Medical Science + +mmmu-biology: + id: mmmu-biology.validation.v1 + metrics: [accuracy] +mmmu-biology.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Biology&split=dev + subject: Biology +mmmu-biology.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Biology&split=validation + subject: Biology + +mmmu-chemistry: + id: mmmu-chemistry.validation.v1 + metrics: [accuracy] +mmmu-chemistry.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Chemistry&split=dev + subject: Chemistry +mmmu-chemistry.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Chemistry&split=validation + subject: Chemistry + +mmmu-clinical-medicine: + id: mmmu-clinical-medicine.validation.v1 + metrics: [accuracy] +mmmu-clinical-medicine.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Clinical_Medicine&split=dev + subject: Clinical Medicine +mmmu-clinical-medicine.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Clinical_Medicine&split=validation + subject: Clinical Medicine + +mmmu-computer-science: + id: mmmu-computer-science.validation.v1 + metrics: [accuracy] +mmmu-computer-science.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Computer_Science&split=dev + subject: Computer Science +mmmu-computer-science.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Computer_Science&split=validation + subject: Computer Science + +mmmu-design: + id: mmmu-design.validation.v1 + metrics: [accuracy] +mmmu-design.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Design&split=dev + subject: Design +mmmu-design.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Design&split=validation + subject: Design + +mmmu-diagnostics-and-laboratory-medicine: + id: mmmu-diagnostics-and-laboratory-medicine.validation.v1 + metrics: [accuracy] +mmmu-diagnostics-and-laboratory-medicine.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Diagnostics_and_Laboratory_Medicine&split=dev + subject: Diagnostics and Laboratory Medicine +mmmu-diagnostics-and-laboratory-medicine.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Diagnostics_and_Laboratory_Medicine&split=validation + subject: Diagnostics and Laboratory Medicine + +mmmu-economics: + id: mmmu-economics.validation.v1 + metrics: 
[accuracy] +mmmu-economics.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Economics&split=dev + subject: Economics +mmmu-economics.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Economics&split=validation + subject: Economics + +mmmu-electronics: + id: mmmu-electronics.validation.v1 + metrics: [accuracy] +mmmu-electronics.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Electronics&split=dev + subject: Electronics +mmmu-electronics.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Electronics&split=validation + subject: Electronics + +mmmu-energy-and-power: + id: mmmu-energy-and-power.validation.v1 + metrics: [accuracy] +mmmu-energy-and-power.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Energy_and_Power&split=dev + subject: Energy and Power +mmmu-energy-and-power.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Energy_and_Power&split=validation + subject: Energy and Power + +mmmu-finance: + id: mmmu-finance.validation.v1 + metrics: [accuracy] +mmmu-finance.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Finance&split=dev + subject: Finance +mmmu-finance.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Finance&split=validation + subject: Finance + +mmmu-geography: + id: mmmu-geography.validation.v1 + metrics: [accuracy] +mmmu-geography.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Geography&split=dev + subject: Geography +mmmu-geography.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Geography&split=validation + subject: Geography + +mmmu-history: + id: mmmu-history.validation.v1 + metrics: [accuracy] +mmmu-history.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=History&split=dev + subject: History +mmmu-history.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=History&split=validation + subject: History + +mmmu-literature: + id: mmmu-literature.validation.v1 + metrics: [accuracy] +mmmu-literature.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Literature&split=dev + subject: Literature +mmmu-literature.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Literature&split=validation + subject: Literature + +mmmu-manage: + id: mmmu-manage.validation.v1 + metrics: [accuracy] +mmmu-manage.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Manage&split=dev + subject: Manage +mmmu-manage.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Manage&split=validation + subject: Manage + +mmmu-marketing: + id: mmmu-marketing.validation.v1 + metrics: [accuracy] +mmmu-marketing.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Marketing&split=dev + subject: Marketing +mmmu-marketing.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Marketing&split=validation + subject: Marketing + +mmmu-materials: + id: mmmu-materials.validation.v1 + metrics: [accuracy] +mmmu-materials.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Materials&split=dev + subject: Materials 
+mmmu-materials.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Materials&split=validation + subject: Materials + +mmmu-math: + id: mmmu-math.validation.v1 + metrics: [accuracy] +mmmu-math.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Math&split=dev + subject: Math +mmmu-math.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Math&split=validation + subject: Math + +mmmu-mechanical-engineering: + id: mmmu-mechanical-engineering.validation.v1 + metrics: [accuracy] +mmmu-mechanical-engineering.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Mechanical_Engineering&split=dev + subject: Mechanical Engineering +mmmu-mechanical-engineering.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Mechanical_Engineering&split=validation + subject: Mechanical Engineering + +mmmu-music: + id: mmmu-music.validation.v1 + metrics: [accuracy] +mmmu-music.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Music&split=dev + subject: Music +mmmu-music.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Music&split=validation + subject: Music + +mmmu-pharmacy: + id: mmmu-pharmacy.validation.v1 + metrics: [accuracy] +mmmu-pharmacy.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Pharmacy&split=dev + subject: Pharmacy +mmmu-pharmacy.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Pharmacy&split=validation + subject: Pharmacy + +mmmu-physics: + id: mmmu-physics.validation.v1 + metrics: [accuracy] +mmmu-physics.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Physics&split=dev + subject: Physics +mmmu-physics.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Physics&split=validation + subject: Physics + +mmmu-psychology: + id: mmmu-psychology.validation.v1 + metrics: [accuracy] +mmmu-psychology.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Psychology&split=dev + subject: Psychology +mmmu-psychology.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Psychology&split=validation + subject: Psychology + +mmmu-public-health: + id: mmmu-public-health.validation.v1 + metrics: [accuracy] +mmmu-public-health.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Public_Health&split=dev + subject: Public Health +mmmu-public-health.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Public_Health&split=validation + subject: Public Health + +mmmu-sociology: + id: mmmu-sociology.validation.v1 + metrics: [accuracy] +mmmu-sociology.dev.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Sociology&split=dev + subject: Sociology +mmmu-sociology.validation.v1: + class: evals.elsuite.mmmu.eval:MMMU + args: + dataset: hf://mmmu/mmmu?name=Sociology&split=validation + subject: Sociology
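One detail worth calling out about the registry entries above: `get_dataset` in `evals/elsuite/mmmu/eval.py` only uses the query string of the `hf://` dataset URL. The `name` and `split` parameters are forwarded to `datasets.load_dataset`, while the host/path portion is ignored because the repo id `"mmmu/mmmu"` is hardcoded. A minimal sketch of the equivalent call for one of the entries above (assuming the `datasets` library is installed and the MMMU dataset is reachable):

```python
from urllib.parse import parse_qs, urlparse

from datasets import load_dataset

# Same parsing as get_dataset() in evals/elsuite/mmmu/eval.py: only the query
# string of the hf:// URL is used; the repo id is hardcoded to "mmmu/mmmu".
url = "hf://mmmu/mmmu?name=Art&split=validation"
query = {k: v[0] for k, v in parse_qs(urlparse(url).query).items()}

dataset = load_dataset("mmmu/mmmu", **query)
# Equivalent to: load_dataset("mmmu/mmmu", name="Art", split="validation")
print(len(dataset))  # 30 validation questions for the Art subject
```

Individual subjects can then be run against a vision-capable completion fn with the usual `oaieval` entry point (e.g. `oaieval gpt-4-vision-preview mmmu-art.validation.v1`), or all 30 at once with `oaievalset` via the `mmmu` eval set registered in `evals/registry/eval_sets/mmmu.yaml`.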