From ded938273e160275127c29f6871e52466e03eabd Mon Sep 17 00:00:00 2001 From: Erik Ritter Date: Sun, 24 Dec 2023 11:22:59 -0800 Subject: [PATCH] Randomly select MMMU answer when none is returned from the model (#1447) This is the behavior the MMMU authors used in their evaluation, so we should match it here. As an example, this increased the mmmu-music benchmark score from `0.3666` to `0.4`, because multiple questions in that benchmark were left unanswered by the model. --- evals/elsuite/mmmu/eval.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/evals/elsuite/mmmu/eval.py b/evals/elsuite/mmmu/eval.py index 0dd4cc3bfd..53cd4c7866 100644 --- a/evals/elsuite/mmmu/eval.py +++ b/evals/elsuite/mmmu/eval.py @@ -159,6 +159,14 @@ def eval_sample(self, sample: Sample, rng): match = sampled.find(f"ANSWER: {correct_answer}") != -1 + if not match and sampled.find("ANSWER") == -1 and sample.question_type == "multiple-choice": + # The model didn't answer anything, so randomly pick an answer + # This matches the behavior described in section 4.1 of the MMMU paper: https://arxiv.org/pdf/2311.16502.pdf + logging.info("No answer found for multiple choice so picking a random answer.") + answer_idx = rng.randint(0, len(sample.answers) - 1) + answer_letter = chr(ord("A") + answer_idx) + match = correct_answer == answer_letter + record_match( match, expected=correct_answer,