From ded938273e160275127c29f6871e52466e03eabd Mon Sep 17 00:00:00 2001
From: Erik Ritter <erik.t.ritter@gmail.com>
Date: Sun, 24 Dec 2023 11:22:59 -0800
Subject: [PATCH] Randomly select MMMU answer when none is returned from the
 model (#1447)

This is the behavior MMMU used for evaluation (see section 4.1 of the
MMMU paper), so we should match it here.

As an example, this increased the mmmu-music benchmark score from
`0.3666` to `0.4`, as multiple questions in that benchmark were
unanswered by the model.
---
 evals/elsuite/mmmu/eval.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/evals/elsuite/mmmu/eval.py b/evals/elsuite/mmmu/eval.py
index 0dd4cc3bfd..53cd4c7866 100644
--- a/evals/elsuite/mmmu/eval.py
+++ b/evals/elsuite/mmmu/eval.py
@@ -159,6 +159,14 @@ def eval_sample(self, sample: Sample, rng):
 
         match = sampled.find(f"ANSWER: {correct_answer}") != -1
 
+        if not match and sampled.find("ANSWER") == -1 and sample.question_type == "multiple-choice":
+            # The model didn't answer anything, so randomly pick an answer
+            # This matches the behavior described in section 4.1 of the MMMU paper: https://arxiv.org/pdf/2311.16502.pdf
+            logging.info("No answer found for multiple choice so picking a random answer.")
+            answer_idx = rng.randint(0, len(sample.answers) - 1)
+            answer_letter = chr(ord("A") + answer_idx)
+            match = correct_answer == answer_letter
+
         record_match(
             match,
             expected=correct_answer,