diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 7fdacd81..35687bd2 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -2,14 +2,27 @@ import contextlib import inspect import io +import random +import re +import string +from ast import literal_eval +from collections.abc import Awaitable, Callable, Sequence from enum import StrEnum -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast + +from pydantic import BaseModel, Field, model_validator + +try: + from litellm import acompletion +except ImportError: + acompletion = None if TYPE_CHECKING: import numpy as np -LLM_EVAL_CONFIG = { +DEFAULT_EVAL_MODEL_NAME = "gpt-4o" +LLM_BOOL_EVAL_CONFIG = { "prompt": ( "Here is a question, the correct answer to the question, and a proposed answer" " to the question. Please tell me if the proposed answer is correct, given the" @@ -18,11 +31,11 @@ "\n\nCorrect answer: {correct_answer}" "\n\nProposed answer: {proposed_answer}" ), - "model": "gpt-4o-mini", + "model": DEFAULT_EVAL_MODEL_NAME, "temperature": 0, } -LLM_SCORE_EVAL_CONFIG = LLM_EVAL_CONFIG | { +LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | { "prompt": ( "Here is a question, the correct answer to the question, and a rubric for" " evaluating the question. Judge the proposed answer based on the given rubric." 
@@ -41,6 +54,13 @@ class EvalAnswerMode(StrEnum): LLM = "llm" # Ask an LLM to evaluate LLM_SCORE = "llm-score" # Ask an LLM to evaluate and return the score (normalized) + def get_default_config(self) -> dict[str, Any]: + if self == EvalAnswerMode.LLM: + return LLM_BOOL_EVAL_CONFIG + if self == EvalAnswerMode.LLM_SCORE: + return LLM_SCORE_EVAL_CONFIG + return {} + def partial_format(value: str, **formats: dict[str, Any]) -> str: """Partially format a string given a variable amount of formats.""" @@ -88,6 +108,23 @@ def is_coroutine_callable(obj) -> bool: return False +async def run_prompt( + prompt: str, model: str = DEFAULT_EVAL_MODEL_NAME, temperature: float | None = None +) -> str: + try: + response = await acompletion( + model=model, + temperature=temperature, + messages=[{"content": prompt, "role": "user"}], + ) + except TypeError: + raise ImportError( + "Answer evaluation requires the 'llm' extra for 'litellm'. Please:" + " `pip install aviary[llm]`." + ) from None + return response.choices[0].message.content + + async def eval_answer( proposed: str, correct: str, @@ -101,40 +138,27 @@ async def eval_answer( """ eval_mode = EvalAnswerMode(eval_mode) if eval_mode in {EvalAnswerMode.LLM, EvalAnswerMode.LLM_SCORE}: - try: - from litellm import acompletion - except ImportError as e: - raise ImportError( - "eval_answer requires the 'llm' extra for 'litellm'. Please:" - " `pip install aviary[llm]`." 
- ) from e if question is None: raise ValueError("Question must be provided for LLM evaluation mode.") - default_config = ( - LLM_EVAL_CONFIG - if eval_mode == EvalAnswerMode.LLM - else LLM_SCORE_EVAL_CONFIG - ) + default_config = eval_mode.get_default_config() config = llm_eval_config or default_config prompt = cast(str, config.get("prompt", default_config["prompt"])).format( question=question, correct_answer=correct, proposed_answer=proposed, ) - response = await acompletion( + response_msg = await run_prompt( + prompt, model=config.get("model", default_config["model"]), temperature=config.get("temperature", default_config["temperature"]), - messages=[{"content": prompt, "role": "user"}], ) if eval_mode == EvalAnswerMode.LLM: return await eval_answer( - response.choices[0].message.content.strip().casefold(), - "yes", - eval_mode=EvalAnswerMode.EXACT, + response_msg.strip().casefold(), "yes", eval_mode=EvalAnswerMode.EXACT ) try: - return float(response.choices[0].content.strip()) / float( - config.get("max_score", default_config["max_score"]) # type: ignore[arg-type] + return float(response_msg.strip()) / float( + config.get("max_score", default_config["max_score"]) ) except ValueError: return 0 @@ -149,3 +173,182 @@ async def eval_answer( return float(gt in pred) raise RuntimeError(f"Invalid evaluation mode: {eval_mode}") + + +_CAPITAL_A_INDEX = ord("A") + + +class MultipleChoiceQuestion(BaseModel): + QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}" + # TODO: combine with above eval_answer and its prompts + EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = ( + "Given the following question and a proposed answer to the question, return the" + " single-letter choice in the question that matches the proposed answer." + " If the proposed answer is blank or an empty string," + " or multiple options are matched, respond with '0'." 
+ "\n\nQuestion: {qa_prompt}" + "\n\nProposed Answer: {qa_answer}" + "\n\nSingle Letter Answer:" + ) + DEFAULT_UNSURE_OPTION: ClassVar[str] = ( + "Insufficient information to answer this question" + ) + SEED_USING_QUESTION: ClassVar[Literal["SEED_USING_QUESTION"]] = ( + "SEED_USING_QUESTION" + ) + + question: str = Field( + description="Question to answer (without multiple choice options)." + ) + options: Sequence[str] = Field(description="All multiple choice options.") + ideal_answer: str = Field( + description=( + "Desired ideal answer. If not one of the provided options, it will be" + " automatically added." + ) + ) + unsure_answer: str | None = Field( + default=DEFAULT_UNSURE_OPTION, + description=( + "Unsure answer text. If not one of the provided options, it will be" + " automatically added." + ), + ) + shuffle_seed: int | Literal["SEED_USING_QUESTION"] | None = Field( + default=None, + description=( + "Optional seed to use in randomization of options, where seeding is not" + " global (e.g. no `random.seed`). Optionally pass in the string literal" + " 'SEED_USING_QUESTION' to hash the question for the seed" + ), + ) + + @model_validator(mode="after") + def add_answers_and_shuffle(self) -> Self: + if self.ideal_answer not in self.options: + self.options = [*self.options, self.ideal_answer] + if self.unsure_answer and self.unsure_answer not in self.options: + self.options = [*self.options, self.unsure_answer] + if len(self.options) > len(string.ascii_lowercase): + raise NotImplementedError( + "Didn't handle more multiple choice options than letters, options were" + f" {self.options}." 
+ ) + if self.shuffle_seed == self.SEED_USING_QUESTION: + self.shuffle_seed = hash(self.question) + if self.shuffle_seed is not None: + self.options = random.Random(self.shuffle_seed).sample( + self.options, k=len(self.options) + ) + # Ensure deserialization doesn't re-shuffle + self.shuffle_seed = None + return self + + @property + def ideal_answer_index(self) -> int: + return self.options.index(self.ideal_answer) + + @property + def unsure_answer_index(self) -> int | None: + if self.unsure_answer is None: + return None + return self.options.index(self.unsure_answer) + + @property + def question_prompt(self) -> str: + return self.QUESTION_PROMPT_TEMPLATE.format( + question=self.question, + options="\n".join([ + f"{_CAPITAL_A_INDEX + i:c}) {o}" for i, o in enumerate(self.options) + ]), + ) + + @staticmethod + def split_options(options: str) -> list[str]: + """Split options string into a list of options. + + Examples: + >>> MultipleChoiceQuestion.split_options("apples, mangos") + ['apples', 'mangos'] + """ + try: + split_options = literal_eval(options) + if not isinstance(split_options, list): + raise TypeError("Need split_options to be a list.") # noqa: TRY301 + except (ValueError, SyntaxError, TypeError): + split_options = [d.strip("'[ ]\"") for d in options.split(",")] + return split_options + + async def grade( + self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None + ) -> "tuple[MultipleChoiceEvaluation, str, str]": + if prompt_runner is None: + prompt_runner = run_prompt + eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format( + qa_prompt=self.question_prompt, qa_answer=answer + ) + raw_evaluation = await prompt_runner(eval_prompt) + evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer( + raw_evaluation, self + ) + return evaluation, raw_evaluation, parsed_answer + + +class MultipleChoiceEvaluation(StrEnum): + CORRECT = "correct" + INCORRECT = "incorrect" + UNSURE = "unsure" # May be irrelevant if no unsure option provided 
+
+    @classmethod
+    def calculate_accuracy_precision(
+        cls, evaluations: Sequence[Self | str]
+    ) -> tuple[float, float]:
+        """
+        Calculate QA-specific accuracy and precision metrics upon evaluations.
+
+        Raises:
+            ZeroDivisionError: if an empty input, or if all evaluations are unsure.
+
+        Returns:
+            Two-tuple of accuracy = (num correct) / (num questions) and
+            precision = (num correct) / ((num questions) - (num unsure)).
+        """
+        evaluations = [e if isinstance(e, cls) else cls(e) for e in evaluations]
+        num_correct = sum(e == cls.CORRECT for e in evaluations)
+        accuracy = num_correct / len(evaluations)
+        precision = num_correct / sum(
+            e in {cls.CORRECT, cls.INCORRECT} for e in evaluations
+        )
+        return accuracy, precision
+
+    @classmethod
+    def from_answer(
+        cls, answer: str, question: MultipleChoiceQuestion
+    ) -> "tuple[MultipleChoiceEvaluation, str]":
+        """Make an evaluation from the input answer and multiple choice question.
+
+        Returns:
+            Two-tuple of answer enum and the raw answer extracted from the input answer.
+        """
+        # SEE: https://regex101.com/r/vcE9Hb/1
+        letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL)
+        # Get the letter answer, or fail over to the first non-whitespace char
+        answer_char = (
+            letter_search.group(1)
+            if letter_search is not None
+            else answer.split()[0][0].upper()
+        )
+        answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX
+        if answer_letter_index < 0 or answer_letter_index >= len(question.options):
+            # The result extracted was not in the options (e.g. '0')
+            return cls.INCORRECT, answer_char
+        # From here, if we don't match either the ideal or the unsure multiple choice
+        # options then we declare the answer as incorrect.
+ if ( + question.unsure_answer_index is not None + and answer_letter_index == question.unsure_answer_index + ): + return cls.UNSURE, cast(str, question.unsure_answer) + if answer_letter_index == question.ideal_answer_index: + return cls.CORRECT, question.ideal_answer + return cls.INCORRECT, question.options[answer_letter_index] diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml new file mode 100644 index 00000000..be2df091 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle + Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "513" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + 
H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k + u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw + 47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK + RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie + H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu + tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6 + AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fde1cf88cf1b-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:29 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "363" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999874" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_aff8daa48aa43d3df077f97da6136e5a + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml new file mode 100644 index 00000000..38077163 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the 
single-letter choice in the question that + matches the proposed answer. If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 14004\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "536" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJLT8MwEITv+RWWzw1KS2lCb3BCSDykcuAhFBl7kxocr2Vveajqf0dO + 2yQVIHHJYb6dyewm64QxrhWfMy6XgmTjTHpWPV6fFzf1Jcwebu4WF4tbunKFwvzj/n3GR9GBL68g + ae86ktg4A6TRbrH0IAhi6jg/nk7zLC9OWtCgAhNttaN0iukkm0zTrEizXa5copYQ+Jw9JYwxtm6f + saJV8MnnLBvtlQZCEDXweTfEGPdoosJFCDqQsMRHPZRoCWzbOhvqHqpVELGWXRmz0zfdiwzWzuNL + 2PFOr7TVYVl6EAFtDA2Ejrd0kzD23C60OujIncfGUUn4BjYGjscn2zzen3BAd4yQhBmaZqNf4koF + JLQJg4twKeQSVG/tzydWSuMAJIOlf5b5LXu7uLb1f+J7ICU4AlU6D0rLw4X7MQ/xB/trrDtyW5iH + r0DQlJW2NXjn9fYbV64U+akqlJDjiieb5BsAAP//AwBRMcSQ7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdc63fbf9e53-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:25 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - 
X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "212" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999868" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_afd8c66d84f3b42a8cd2b8a6bf855054 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml new file mode 100644 index 00000000..057ef1d0 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: \n\nSingle + Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "517" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznKyyfG+S2gaa5FegFJKRKnEAocp1NaurYlr3hoar/jpyG + JBUgcfFhZmc8O/YhIoTKgmaEih1HUVsVr8qnh/XbzfvdI7LN7WzF9vf8Mik31xtYN3QSFGb7CgK/ + VRfC1FYBSqNPtHDAEYLrdDFPkgVbpGlL1KYAFWSVxTgx8YzNkpilMbvqhDsjBXiakeeIEEIO7Rki + 6gI+aEbY5BupwXteAc36IUKoMyoglHsvPXKNdDKQwmgE3aZmY9xB2XgeYulGqQ4/9hcpU1lntr7j + e7yUWvpd7oB7o4OpR2Npyx4jQl7ahZqzjNQ6U1vM0exBB8MpW5786FDhiO04NMjVCJ52LZzb5QUg + l8qPGqGCix0Ug3SojzeFNCMiGi39M8xv3qfFpa7+Yz8QQoBFKHLroJDifOFhzEH4YH+N9SW3gan/ + 9Ah1XkpdgbNOnt64tPmy5Fu+LNk8pdEx+gIAAP//AwDTwVpp7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fddcea1d251d-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:28 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh 
+ openai-processing-ms: + - "174" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999872" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_4d40eb2c66dfd308a7b75c7cd80c405b + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml new file mode 100644 index 00000000..a0acce15 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: \n\nSingle Letter + Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "511" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJRT4MwFIXf+RVNn4eBicPx5oszWTJjfNBoDOnaC9SVtmlLMrPsv5sW + HCzOxBceznfP4dwLhwghzBkuEKYNcbTVIr6r3jar1eb5IXtKb7brPL9/3e/a5fxFrOtHPPMOtf0E + 6n5cV1S1WoDjSvaYGiAOfGqaX2dZnuTLJIBWMRDeVmsXZyqeJ/MsTm7jZDEYG8UpWFyg9wghhA7h + 6StKBntcoBATlBasJTXg4jSEEDZKeAUTa7l1RDo8GyFV0oEMrZOpbqDqLPG1ZCfEoB9PLxKq1kZt + 7cBPesUlt01pgFglfah1SuNAjxFCH2Gh7qwj1ka12pVO7UD6wDRZ9Hl4POGEDswpR8TUlM8uxJUM + HOHCTi6CKaENsNE6no90jKsJiCZL/y5zKbtfnMv6P/EjoBS0A1ZqA4zT84XHMQP+B/tr7HTkUBjb + L+ugLSsuazDa8P4bV7ok+ZLdMkLTCkfH6BsAAP//AwBwbnWk7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fde81f05ceb1-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:30 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + 
openai-processing-ms: + - "332" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999875" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_817ca7ae018d7baa48236c7ad4f4f151 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml new file mode 100644 index 00000000..d70cc972 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml @@ -0,0 +1,105 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What method + was used to demonstrate that the enzyme PafA is stable after incubation with + 4M urea for 14 days?\n\nOptions:\nA) cryo EM\nB) Insufficient information to + answer this question\nC) NMR\nD) x-ray crystallography\nE) circular dichroism\n\nProposed + Answer: \n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "624" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJLNboMwEITvPIXlM1SEkJJyS9VIlfpz6SmpKuSYhTg1tmVvpEZR3r0y + IUDVVOqFw3w7w+zCMSCEipLmhPItQ94YGS2q9eujmL9NXtaJWKW7wxM+r+TD8rC4ny5p6B16swOO + F9cN142RgEKrM+YWGIJPnWTTNM3i7G7SgkaXIL2tNhilOkriJI3ieRTfdsatFhwczcl7QAghx/bp + K6oSvmhO4vCiNOAcq4Hm/RAh1GrpFcqcEw6ZQhoOkGuFoNrW8Vi3UO0d87XUXspOP/Uvkro2Vm9c + x3u9Ekq4bWGBOa18qENtaEtPASEf7UL7Hx2psboxWKD+BOUDJ9PknEeHE45ox1Ajk2PTNLwSV5SA + TEg3ugjljG+hHKzD+di+FHoEgtHSv8tcyz4vLlT9n/gBcA4GoSyMhVLwnwsPYxb8D/bXWH/ktjB1 + B4fQFJVQNVhjxfkbV6aosvkMNrMqzWhwCr4BAAD//wMANO06tewCAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdedda0ceb36-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:31 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + 
access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "259" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999845" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_223a9415a5a19029f86768ffbabf3d6f + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml new file mode 100644 index 00000000..7f73abaa --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "536" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6ggoXlwyyGKcql6Sx+qkLEXcGtsyzZSoyj/vTKQ + QNRU6oXDfDvD7MIpQAhzhjOEaU0cbbSINuXb02Z73L287ndH1ias2C8k3a0OB1Y949A7VPEJ1F1c + D1Q1WoDjSvaYGiAOfGqynKfpMl6u0g40ioHwtkq7KFXRLJ6lUbyK4sVgrBWnYHGG3gOEEDp1T19R + MvjGGYrDi9KAtaQCnF2HEMJGCa9gYi23jkiHwxFSJR3IrvV2qhsoW0t8LdkKMejn64uEqrRRhR34 + VS+55LbODRCrpA+1Tmnc0XOA0Ee3UHvTEWujGu1yp75A+sAkeezz8HjCCR2YU46IqWkR3onLGTjC + hZ1cBFNCa2CjdTwfaRlXExBMlv5d5l52vziX1X/iR0ApaAcs1wYYp7cLj2MG/A/219j1yF1hbI/W + QZOXXFZgtOH9Ny51vi5JQdZlPF/h4Bz8AAAA//8DAKuPA4PsAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdc06b78cf26-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:24 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - 
future-house-xr4tdh + openai-processing-ms: + - "291" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999868" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_89e3d88c7f12861d7e774e452300b36d + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml new file mode 100644 index 00000000..45376f48 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 94106\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "536" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4ySX2uDMBTF3/0UIc912M7WzreVjv1hDPo06BiSJlfNFpMsibBS+t1H1KplHezF + h/O753ju1UOAEOYMpwjTkjhaaRHe5tuX1erpfsv3z8s4ed1sHvL1191+UT9WDk+8Q+0+gLqT64qq + SgtwXMkWUwPEgU+dJtdxnETJct6ASjEQ3lZoF8YqnEWzOIyWYbTojKXiFCxO0VuAEEKH5ukrSgbf + OEXR5KRUYC0pAKf9EELYKOEVTKzl1hHZ1u0gVdKBbFqvx7qBvLbE15K1EJ1+7F8kVKGN2tmO93rO + JbdlZoBYJX2odUrjhh4DhN6bheqzjlgbVWmXOfUJ0gdOp/M2Dw8nHNGOOeWIGJsWkwtxGQNHuLCj + i2BKaAlssA7nIzXjagSC0dK/y1zKbhfnsvhP/AAoBe2AZdoA4/R84WHMgP/B/hrrj9wUxnZvHVRZ + zmUBRhvefuNcZyS5YUtG6DTHwTH4AQAA//8DAK9WW8vsAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdcb6d3b645e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:26 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - 
future-house-xr4tdh + openai-processing-ms: + - "282" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999868" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_0f4462cd5dd31fe3e1a9d6847e563042 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml new file mode 100644 index 00000000..abe7094e --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 94106 or 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "545" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFJBS8MwGL33V4ScV+m2sm69CaLTgyjIDhMpWfK1zZYmIUnFMfbfJW3X + dqjgJYf3vvfyvpecAoQwZzhFmJbE0UqL8DbfPt8V+4U8bO6PL0/rTUnWD8XrVr59PkZ44hVqtwfq + LqobqiotwHElW5oaIA686zSZx3ESJcukISrFQHhZoV0Yq3AWzeIwWobRohOWilOwOEXvAUIInZrT + R5QMvnCKoskFqcBaUgBO+yGEsFHCI5hYy60j0uHJQFIlHcgmdTTGDeS1JT6WrIXo8HN/kVCFNmpn + O77Hcy65LTMDxCrpTa1TGjfsOUDoo1movsqItVGVdplTB5DecDpdtX54qHDEdpxTjogRPOtauLbL + GDjChR01gimhJbBBOtRHasbViAhGS/8M85t3uziXxX/sB4JS0A5Ypg0wTq8XHsYM+A/211hfchMY + 26N1UGU5lwUYbXj7xrnOVjnZkVUezZc4OAffAAAA//8DAAUNxI3sAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdd6bc71fa2e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:27 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + 
openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "249" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999865" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_72f1a2af642dad2884e52d652e775182 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml new file mode 100644 index 00000000..607cd8a2 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: Insufficient + information\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "541" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6gIoDy4pS+lPeRWVWpVIWMWcGNsyzZSqyj/vTIQ + IGoq9cJhvp1hduHoIYRZgVOEaU0sbRQPtuXb/u4pP7w0z69kt3+I6221q/fN/SOJNPadQ+afQO3Z + dUNlozhYJkWPqQZiwaUuVnGSrMLVetmBRhbAna1SNkhkEIVREoTrIFwOxloyCgan6N1DCKFj93QV + RQFfOEWhf1YaMIZUgNNxCCGsJXcKJsYwY4mw2J8glcKC6FrfznUNZWuIqyVazgf9NL6Iy0ppmZuB + j3rJBDN1poEYKVyosVLhjp48hD66hdqLjlhp2SibWXkA4QIXi6jPw9MJZ3RgVlrC56bYvxKXFWAJ + 42Z2EUwJraGYrNP5SFswOQPebOnfZa5l94szUf0nfgKUgrJQZEpDwejlwtOYBveD/TU2HrkrjM23 + sdBkJRMVaKVZ/41LlW1KkpNNGcZr7J28HwAAAP//AwBPQ8gX7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdd11ed0175e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:27 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: 
+ - future-house-xr4tdh + openai-processing-ms: + - "196" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999867" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_9dd0f40823dceb910336a862f0513c68 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_eval_answer[llm basic].yaml b/tests/cassettes/test_eval_answer[llm basic].yaml index 6355f1c6..63f1bb18 100644 --- a/tests/cassettes/test_eval_answer[llm basic].yaml +++ b/tests/cassettes/test_eval_answer[llm basic].yaml @@ -5,9 +5,9 @@ interactions: question, and a proposed answer to the question. Please tell me if the proposed answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. No other output is permitted.\n\nQuestion: Which of the following is most likely - true:\n\n A) Piggie, B) Pigeon, C) Gerald\n \n\nCorrect answer: C \n\nProposed + true:\n\nA) Piggie, B) Pigeon, C) Gerald\n\n\nCorrect answer: C\n\nProposed answer: Based on all factors considered, the most compelling answer is Gerald, - C", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' + C", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,13 +16,13 @@ interactions: connection: - keep-alive content-length: - - "524" + - "516" content-type: - application/json host: - api.openai.com user-agent: - - AsyncOpenAI/Python 1.48.0 + - AsyncOpenAI/Python 1.57.4 x-stainless-arch: - arm64 x-stainless-async: @@ -32,7 +32,7 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.48.0 + - 1.57.4 x-stainless-raw-response: - "true" x-stainless-retry-count: @@ -40,23 +40,24 @@ interactions: x-stainless-runtime: - CPython x-stainless-runtime-version: - - 3.12.5 + - 3.12.7 
method: POST uri: https://api.openai.com/v1/chat/completions response: body: string: !!binary | - H4sIAAAAAAAAA3SQzW6DMBCE7zyFtedQASVQuKVqTlErtekhVVUhYxbi1tgWdtSfKO9eGQihh158 - mG9nPLtHjxDgFeQE2J5a1mrhr26f6yx8SjaP6y3fyfjnQS7p/Se73qx2d7BwDlW+I7Nn1xVTrRZo - uZIDZh1Siy41TKM0ym6ybNmDVlUonK3R1o+V33LJ/SiIYj9I/fBmdO8VZ2ggJ68eIYQc+9f1lBV+ - QU6CxVlp0RjaIOTTECHQKeEUoMZwY6m0sLhApqRF2Vd/WW/npMP6YKhrJw9CjPpp+kqoRneqNCOf - 9JpLbvZFh9Qo6WKNVRp6evIIeetXOvxpCbpTrbaFVR8oXWAYLIc8uFxyRkdmlaVibkr+MxUVWsqF - md0FhoJcNpeEYGrZrwnm21hsi5rLBjvd8eFMtS7CsoyTMEnrDLyT9wsAAP//AwDs99+sNAIAAA== + H4sIAAAAAAAAAwAAAP//jJI/b8IwEMX3fArLM6kCpPzbGDpQVahSh4pWVWTsS+Li+CzbkaCI7145 + AZKqVOri4X73nt+dfYwIoVLQBaG8ZJ5XRsXL/G09e3zePyX1YbxZrhzuynKTi9WXW7/SQVDg9hO4 + v6juOFZGgZeoW8wtMA/BdTgdp+k0mc5GDahQgAqywvg4xXiUjNI4mcXJ5CwsUXJwdEHeI0IIOTZn + iKgF7OmCJINLpQLnWAF0cW0ihFpUoUKZc9J5pj0ddJCj9qCb1JuHlz6xkNeOhWC6VupcP12vUlgY + i1t35td6LrV0ZWaBOdTB1nk0tKGniJCPZqT6R0pqLFbGZx53oIPhMLlv/Wi3xB49M4+eqb5oMrhh + lwnwTCrX2wnljJcgOmm3QFYLiT0Q9Yb+HeaWdzu41MV/7DvAORgPIjMWhOQ/B+7aLIQv9lfbdclN + YOoOzkOV5VIXYI2V7SvnJpvnbMvmeTKe0egUfQMAAP//AwAWS34s7gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c8e093e1c03cf61-SJC + - 8f39fdb5cae1158a-SJC Connection: - keep-alive Content-Encoding: @@ -64,14 +65,14 @@ interactions: Content-Type: - application/json Date: - - Wed, 25 Sep 2024 21:16:35 GMT + - Tue, 17 Dec 2024 21:26:22 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=yrnktGfLlTro_mHAi1eDs07tPAi.UO9vu4j2ZH79Tpg-1727298995-1.0.1.1-P4rF_tY6JKwj5f7PAq_pHP4sB0tH4oW2cKEw9s49GhjJh7xBzJvGDdQhTjpCmBpHQxeJgCvh8dFCe3ZZU6JlqA; - path=/; expires=Wed, 25-Sep-24 21:46:35 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=lVkT7i5qloNOJW3VW5kf8Ohm6U080WiPUv6XirXCoFk-1734470782-1.0.1.1-nAgxt2GizSWkF.auEc_j1tv3Erjbd74Lsh9WJmMaZa_E8fpVuEZ8SsBIqLBHICQDV0sfwSjHgP9mTBHQujl_XA; + path=/; expires=Tue, 17-Dec-24 21:56:22 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - 
_cfuvid=NDOCPHgoZ2Ov2vByfj1bojq4LnSC2lXMWDthSvh6SdE-1727298995191-0.0.1.1-604800000; + - _cfuvid=YCWb3aZdtzEmsWTuiPgC.gchnL7jvJLEWh9yvJqAiAw-1734470782603-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -84,25 +85,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "92" + - "124" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "30000" + - "10000" x-ratelimit-limit-tokens: - - "150000000" + - "30000000" x-ratelimit-remaining-requests: - - "29999" + - "9999" x-ratelimit-remaining-tokens: - - "149999876" + - "29999876" x-ratelimit-reset-requests: - - 2ms + - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_782956501d1cbc9f3e44b8c9ceee21ec + - req_84a9ec4746765b74e4d84610ebc880ad status: code: 200 message: OK diff --git a/tests/cassettes/test_eval_llm_config.yaml b/tests/cassettes/test_eval_llm_config.yaml index 49310cb6..383479dc 100644 --- a/tests/cassettes/test_eval_llm_config.yaml +++ b/tests/cassettes/test_eval_llm_config.yaml @@ -4,8 +4,8 @@ interactions: '{"messages": [{"content": "Here is a question, the correct answer to the question, and a proposed answer to the question. Please tell me if the proposed answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. No - other output is permitted.\n\nQuestion: What is 25 * 10? 
\n\nCorrect answer: - 250 \n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o-mini", "temperature": + other output is permitted.\n\nQuestion: What is 25 * 10?\n\nCorrect answer: + 250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o", "temperature": 0.5}' headers: accept: @@ -15,13 +15,13 @@ interactions: connection: - keep-alive content-length: - - "394" + - "387" content-type: - application/json host: - api.openai.com user-agent: - - AsyncOpenAI/Python 1.48.0 + - AsyncOpenAI/Python 1.57.4 x-stainless-arch: - arm64 x-stainless-async: @@ -31,31 +31,32 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.48.0 + - 1.57.4 x-stainless-raw-response: - "true" x-stainless-retry-count: - - "0" + - "1" x-stainless-runtime: - CPython x-stainless-runtime-version: - - 3.12.5 + - 3.12.7 method: POST uri: https://api.openai.com/v1/chat/completions response: body: string: !!binary | - H4sIAAAAAAAAA3SQS0/DMBCE7/kV1p4blEShedx4VEJwAFQ4IIQiJ9kkBr9kuwhU9b8jpyEpBy4+ - zLcznt19QAiwFkoCzUBdIzQPLy6fuuI5ij6z7vF6fZff39itYMNt2l89bGDlHap+x8b9us4aJTRH - x5Q84sYgdehT4yzJkiIvivMRCNUi97ZeuzBVoWCShUmUpGGUhXE+uQfFGrRQkteAEEL24+t7yha/ - oCTR6lcRaC3tEcp5iBAwinsFqLXMOiodrBbYKOlQjtVfNttTYrDbWerbyR3nk36Yv+Kq10bVduKz - 3jHJ7FAZpFZJH2ud0jDSQ0DI27jS7k9L0EYJ7SqnPlD6wHzaCJZDLjCemFOO8hNP/J+natFRxu3J - VeBYj8l+CYjmjuOSYL+tQ1F1TPZotGHHI3W6ius6XcfrrCsgOAQ/AAAA//8DAMNNIlYyAgAA + H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1ycH+okt0Ja6KUthBzaUowirW2lslZIG0gJefci + x4kT2kIvPsy3M55de58wxrXic8ZlLUg2zqR35dvTLF+8PJgRPhLUrtwszWKVPa92M+CD6MD1BiSd + XDcSG2eANNojlh4EQUwd5uPJJM/y6bgFDSow0VY5SieYjrLRJM2maXbbGWvUEgKfs/eEMcb27TNW + tAp2fM6ywUlpIARRAZ+fhxjjHk1UuAhBBxKW+KCHEi2BbVu/3i8viYdyG0QsZrfGdPrh/CqDlfO4 + Dh0/66W2OtSFBxHQxthA6HhLDwljH+1K26uW3HlsHBWEn2BjYD47xvH+hj0cdoyQhOnlaXeF67BC + AQltwsVFuBSyBtU7+/OJrdJ4AZKLlX92+S37uLa21X/ieyAlOAJVOA9Ky+t9+zEP8Qf7a+x84rYw + D1+BoClKbSvwzuvjNy5dIfKZmiohhyVPDsk3AAAA//8DADQLsKzsAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c8e093e1c06cf1b-SJC + 
- 8f39fdbac9db643b-SJC Connection: - keep-alive Content-Encoding: @@ -63,15 +64,9 @@ interactions: Content-Type: - application/json Date: - - Wed, 25 Sep 2024 21:16:35 GMT + - Tue, 17 Dec 2024 21:26:23 GMT Server: - cloudflare - Set-Cookie: - - __cf_bm=pym14eCUZA0SXlddekdvYvv4El15fQ5PcElgWShT4_0-1727298995-1.0.1.1-j7XxNBD437f6YvFS5ZFNifdIzdOW1J.vAbnfSMGsYibmP7pxa4RBjnc1c9T7BMJR8ctMw5kl28D5nFYJGkS7Zw; - path=/; expires=Wed, 25-Sep-24 21:46:35 GMT; domain=.api.openai.com; HttpOnly; - Secure; SameSite=None - - _cfuvid=ipHIXWJzY6qksCWMRv8pk2ueHiG5GB8g2GBZbMGPsBc-1727298995230-0.0.1.1-604800000; - path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -83,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "133" + - "231" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "30000" + - "10000" x-ratelimit-limit-tokens: - - "150000000" + - "30000000" x-ratelimit-remaining-requests: - - "29999" + - "9999" x-ratelimit-remaining-tokens: - - "149999907" + - "29999909" x-ratelimit-reset-requests: - - 2ms + - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_02646a0ce82be0e9b7240968034e1d6d + - req_6538a77713d1ec9b61a8e15f3cf37377 status: code: 200 message: OK diff --git a/tests/test_utils.py b/tests/test_utils.py index de46749c..3469f01c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,10 @@ +from collections.abc import Iterable, Sequence + import pytest from aviary.core import eval_answer +from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion +from tests.conftest import VCR_DEFAULT_MATCH_ON @pytest.mark.vcr @@ -40,3 +44,221 @@ async def test_eval_answer( async def test_eval_llm_config(): config = {"temperature": 0.5} assert await eval_answer("250", "250", "What is 25 * 10?", "llm", config) + + +class TestLitQAEvaluation: + @staticmethod + 
def _assert_prompt_is_valid( + mc_question: MultipleChoiceQuestion, + question: str, + ideal_answer: str, + distractors: Iterable[str], + ) -> None: + question_prompt = mc_question.question_prompt + for substr in ( + question, + "Insufficient information", + ideal_answer, + *distractors, + ): + assert question_prompt.count(substr) == 1 + + # Use for general purpose testing + ZIP_CODE_QUESTION_IDEAL_DISTRACTORS = ( + "What is my office's zip code?", + "94107", + ["-8", "94106", "cheesecake"], + ) + # The following two are used to check we don't leak on the LLM's innate knowledge + MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS = ( + "What is the meaning of life?", + "42", + ["-84", "11", "cheesecake"], + ) + # Source: https://github.com/Future-House/LAB-Bench/blob/43b2045c67a2da12c233689cf538f1ed5c42f590/LitQA2/litqa-v2-public.jsonl#L130 + LITQA2_QUESTION_IDEAL_DISTRACTORS = ( + ( + "What method was used to demonstrate that the enzyme PafA is stable after" + " incubation with 4M urea for 14 days?" 
+ ), + "circular dichroism", + ["cryo EM", "x-ray crystallography", "NMR"], + ) + + @pytest.mark.asyncio + @pytest.mark.vcr(match_on=[*VCR_DEFAULT_MATCH_ON, "body"]) + @pytest.mark.parametrize( + ( + "question", + "ideal_answer", + "distractors", + "actual_answer", + "expected_eval", + "expected_extracted_answer", + ), + [ + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 94107", + MultipleChoiceEvaluation.CORRECT, + "94107", + id="matched-correct-option", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 14004", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="didnt-match-and-no-llm-innate-knowledge", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 94106", + MultipleChoiceEvaluation.INCORRECT, + "94106", + id="matched-incorrect-option", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "Insufficient information", + MultipleChoiceEvaluation.UNSURE, + MultipleChoiceQuestion.DEFAULT_UNSURE_OPTION, + id="matched-unsure-option", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 94106 or 94107", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="matched-several-options", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="empty-answer1", + ), + pytest.param( + *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, + "14", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="didnt-match-and-llm-has-innate-knowledge", + ), + pytest.param( + *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, + "", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="empty-answer2", + ), + pytest.param( + *LITQA2_QUESTION_IDEAL_DISTRACTORS, + "", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="empty-answer3", + ), + ], + ) + async def test_grade( + self, + question: str, + ideal_answer: str, + distractors: str | list[str], + actual_answer: str, + expected_eval: MultipleChoiceEvaluation, + expected_extracted_answer: str, 
+ ) -> None: + """Tests that we can create a multiple choice question and evaluate answers.""" + mc_question = MultipleChoiceQuestion( + question=question, + options=distractors, + ideal_answer=ideal_answer, + shuffle_seed=42, # Seed for VCR cassette + ) + self._assert_prompt_is_valid(mc_question, question, ideal_answer, distractors) + evaluation, _, graded_answer = await mc_question.grade(actual_answer) + assert evaluation == expected_eval + if evaluation == MultipleChoiceEvaluation.CORRECT: + assert graded_answer == ideal_answer + assert graded_answer == expected_extracted_answer + + def test_consistent_mc_options(self) -> None: + """Tests that creating multiple evaluations with the same seed results in the same prompt.""" + question, ideal, distractors = self.MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS + mc_question_1a = MultipleChoiceQuestion( + question=question, ideal_answer=ideal, options=distractors, shuffle_seed=0 + ) + self._assert_prompt_is_valid(mc_question_1a, question, ideal, distractors) + + mc_question_1b = MultipleChoiceQuestion( + question=question, ideal_answer=ideal, options=distractors, shuffle_seed=0 + ) + self._assert_prompt_is_valid(mc_question_1b, question, ideal, distractors) + assert mc_question_1a == mc_question_1b, ( + "Same seeding should lead to same prompts" + ) + + mc_question_1a_copy = MultipleChoiceQuestion(**mc_question_1a.model_dump()) + self._assert_prompt_is_valid(mc_question_1a_copy, question, ideal, distractors) + assert mc_question_1a == mc_question_1b, ( + "Serialization then deserialization should lead to same prompts" + ) + + mc_question_2a = MultipleChoiceQuestion( + question=question, + ideal_answer=ideal, + options=distractors, + shuffle_seed=MultipleChoiceQuestion.SEED_USING_QUESTION, + ) + self._assert_prompt_is_valid(mc_question_2a, question, ideal, distractors) + + mc_question_2b = MultipleChoiceQuestion( + question=question, + ideal_answer=ideal, + options=distractors, + 
shuffle_seed=MultipleChoiceQuestion.SEED_USING_QUESTION, + ) + self._assert_prompt_is_valid(mc_question_2b, question, ideal, distractors) + assert mc_question_2a == mc_question_2b, ( + "Same seeding strategy should lead to same prompts" + ) + assert mc_question_2a != mc_question_1a, ( + "Different seeding strategies should lead to different prompts" + ) + + +class TestMultipleChoiceEvaluation: + @pytest.mark.parametrize( + ("evals", "accuracy_precision"), + [ + ( + [ + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.CORRECT, + ], + (1, 1), + ), + (["correct", "correct", "unsure"], (2 / 3, 1)), + ( + [ + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.UNSURE, + "incorrect", + ], + (1 / 3, 1 / 2), + ), + ], + ) + def test_calculate_accuracy_precision( + self, + evals: Sequence[MultipleChoiceEvaluation], + accuracy_precision: tuple[float, float], + ) -> None: + assert ( + MultipleChoiceEvaluation.calculate_accuracy_precision(evals) + == accuracy_precision + )