diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b91cacef..84fe21b3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -84,8 +84,8 @@ repos: - aiohttp>=3.10.6 # Match pyproject.toml - PyMuPDF>=1.24.12 - anyio - - fhaviary[llm]>=0.10.2 # Match pyproject.toml - - ldp>=0.14.5 # Match pyproject.toml + - fhaviary[llm]>=0.14 # Match pyproject.toml + - ldp>=0.17 # Match pyproject.toml - html2text - fh-llm-client - httpx diff --git a/paperqa/_ldp_shims.py b/paperqa/_ldp_shims.py index 744e2d8a..02dfb13a 100644 --- a/paperqa/_ldp_shims.py +++ b/paperqa/_ldp_shims.py @@ -15,6 +15,7 @@ "UIndexMemoryModel", "_Memories", "discounted_returns", + "evaluate_consensus", "set_training_mode", ] @@ -29,7 +30,12 @@ SimpleAgent, SimpleAgentState, ) - from ldp.alg import Callback, ComputeTrajectoryMetricsMixin, RolloutManager + from ldp.alg import ( + Callback, + ComputeTrajectoryMetricsMixin, + RolloutManager, + evaluate_consensus, + ) from ldp.graph.memory import Memory, UIndexMemoryModel from ldp.graph.op_utils import set_training_mode from ldp.utils import discounted_returns @@ -57,4 +63,5 @@ class Callback: # type: ignore[no-redef] SimpleAgentState = None # type: ignore[assignment,misc] UIndexMemoryModel = None # type: ignore[assignment,misc] discounted_returns = None # type: ignore[assignment] + evaluate_consensus = None # type: ignore[assignment] set_training_mode = None # type: ignore[assignment] diff --git a/paperqa/agents/env.py b/paperqa/agents/env.py index 7971bc86..fbdaf704 100644 --- a/paperqa/agents/env.py +++ b/paperqa/agents/env.py @@ -11,6 +11,7 @@ ToolRequestMessage, ToolResponseMessage, ) +from aviary.utils import MultipleChoiceQuestion from llmclient import EmbeddingModel, LiteLLMModel from paperqa.docs import Docs @@ -238,10 +239,11 @@ def make_initial_state(self) -> EnvironmentState: ): status_fn = clinical_trial_status + query: str | MultipleChoiceQuestion = self._query.query return EnvironmentState( docs=self._docs, session=PQASession( - question=self._query.query, + question=query if isinstance(query, str) else query.question_prompt, config_md5=self._query.settings.md5, id=self._query.id, ), diff --git a/paperqa/agents/models.py b/paperqa/agents/models.py index eaf366d6..906e8058 100644 --- a/paperqa/agents/models.py +++ b/paperqa/agents/models.py @@ -8,6 +8,7 @@ from typing import Any, ClassVar, Protocol from uuid import UUID, uuid4 +from aviary.utils import MultipleChoiceQuestion from llmclient import LiteLLMModel, LLMModel from pydantic import ( BaseModel, @@ -55,7 +56,13 @@ class MismatchedModelsError(Exception): class QueryRequest(BaseModel): model_config = ConfigDict(extra="forbid") - query: str = "" + query: str | MultipleChoiceQuestion = Field( + default="", + description=( + "The query to be answered. Set to a multiple choice question when grading" + " (e.g. for training)." 
+ ), + ) id: UUID = Field( default_factory=uuid4, description="Identifier which will be propagated to the Answer object.", diff --git a/paperqa/agents/task.py b/paperqa/agents/task.py index 96220146..0ae55a64 100644 --- a/paperqa/agents/task.py +++ b/paperqa/agents/task.py @@ -10,13 +10,14 @@ import logging import re from abc import ABC -from collections.abc import Awaitable, Callable, Mapping, Sequence +from collections.abc import Awaitable, Callable, Iterable, Mapping, Sequence from copy import deepcopy from enum import StrEnum -from typing import TYPE_CHECKING, Any, Self, assert_never +from typing import TYPE_CHECKING, Any, Self, assert_never, cast from aviary.core import ( TASK_DATASET_REGISTRY, + Environment, Frame, Messages, TaskDataset, @@ -24,15 +25,22 @@ ToolResponseMessage, ) from aviary.env import ENV_REGISTRY +from aviary.utils import ( + DEFAULT_EVAL_MODEL_NAME, + MultipleChoiceEvaluation, + MultipleChoiceQuestion, +) from llmclient import EmbeddingModel, LiteLLMModel, LLMModel -from paperqa._ldp_shims import ComputeTrajectoryMetricsMixin +from paperqa._ldp_shims import ( + Callback, + ComputeTrajectoryMetricsMixin, + evaluate_consensus, +) from paperqa.docs import Docs from paperqa.litqa import ( - DEFAULT_EVAL_MODEL_NAME, DEFAULT_LABBENCH_HF_HUB_NAME, DEFAULT_REWARD_MAPPING, - LitQAEvaluation, read_litqa_v2_from_hub, ) from paperqa.types import DocDetails, PQASession @@ -40,10 +48,11 @@ from .env import POPULATE_FROM_SETTINGS, PaperQAEnvironment from .models import QueryRequest from .search import SearchIndex, maybe_get_manifest -from .tools import Complete +from .tools import Complete, EnvironmentState if TYPE_CHECKING: - from ldp.data_structures import Trajectory + from ldp.agent import Agent + from ldp.data_structures import Trajectory, Transition logger = logging.getLogger(__name__) @@ -58,26 +67,22 @@ def __init__( llm_model: LiteLLMModel | None = POPULATE_FROM_SETTINGS, summary_llm_model: LiteLLMModel | None = POPULATE_FROM_SETTINGS, embedding_model: EmbeddingModel | None = POPULATE_FROM_SETTINGS, - evaluation_from_answer: ( - Callable[[PQASession | str], Awaitable[LitQAEvaluation]] | None - ) = None, sources: str | list[str] | None = None, rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING, - evaluation_callback: Callable[[LitQAEvaluation], Awaitable] | None = None, + evaluation_callback: ( + Callable[[MultipleChoiceEvaluation], Awaitable] | None + ) = None, **env_kwargs, ): super().__init__( query, docs, llm_model, summary_llm_model, embedding_model, **env_kwargs ) - self._evaluation_from_answer = evaluation_from_answer # Enables checking an Index has the right DOI(s) self.sources: list[str] | None = ( [sources] if isinstance(sources, str) else sources ) self._evaluation_callback = evaluation_callback self._rewards = rewards - self.answer = "" - self.ideal = "" async def validate_sources( self, manifest_or_index: dict[str, DocDetails] | SearchIndex | None = None @@ -120,7 +125,7 @@ async def step( self, action: ToolRequestMessage ) -> tuple[Messages, float, bool, bool]: messages, reward, done, truncated = await super().step(action) - if not done or not self._evaluation_from_answer: + if not done or not isinstance(self._query.query, MultipleChoiceQuestion): return messages, reward, done, truncated # If the ensuring evaluation fails (e.g. due to OpenAI being down), we can: # - Suppress the exception and declare the evaluation as incorrect, which can @@ -130,23 +135,13 @@ async def step( # incorrectly reward what otherwise was a good trajectory. 
# - Don't suppress the exception, which leads to the trajectory failing, and # removes it from the learnable pool. This is the only safe default behavior. - evaluation = await self._evaluation_from_answer(self.state.session.answer) + evaluation, self.state.session.graded_answer = await self._query.query.grade( + self.state.session.answer + ) if evaluation_callback := self._evaluation_callback: await evaluation_callback(evaluation) - self.answer = evaluation.answer or "" - self.ideal = evaluation.ideal or "" return messages, reward + self._rewards[evaluation.value], done, truncated - def export_frame(self) -> Frame: - return Frame( - state=self.state, - info={ - "query": self._query, - "answer": self.answer, - "ideal": self.ideal, - }, - ) - def __deepcopy__(self, memo) -> Self: copy_state = deepcopy(self.state, memo) # We don't know the side effects of deep copying a litellm.Router, @@ -162,7 +157,6 @@ def __deepcopy__(self, memo) -> Self: copy_self = type(self)( query=deepcopy(self._query, memo), # deepcopy for _docs_name docs=copy_state.docs, - evaluation_from_answer=self._evaluation_from_answer, sources=self.sources, rewards=self._rewards, evaluation_callback=self._evaluation_callback, @@ -182,6 +176,120 @@ def __deepcopy__(self, memo) -> Self: ) +async def evaluate_consensus_sampling( + data: Iterable[GradablePaperQAEnvironment | Frame], + exclude_no_answer: bool = False, + num_samples: int = 1, + seed: int | None = None, +) -> tuple[dict[str, list[tuple[str, int]]], float]: + """ + Create consensus groups based on question and evaluate the consensus for each. + + Args: + data: Data to evaluate consensus upon, either gradable environments or frames. + exclude_no_answer: Opt-in flag to filter out empty answers (due to the + Environment/Frame not having a graded answer). Use of this flag does not + affect the accuracy term of the return. + num_samples: Passed through to evaluate_consensus. + seed: Passed through to evaluate_consensus. + + Returns: + Two-tuple of consensus list generated by collections.Counter.most_common (keys + are question, values are list of (answer, vote count)) and the proportion of + groups for which the consensus matches the ideal. 
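For context, a minimal sketch (not part of this patch) of the grading path the revised `step()` relies on: the query now carries an aviary `MultipleChoiceQuestion`, whose `question_prompt` becomes the session question and whose `grade()` maps a free-text answer to a `MultipleChoiceEvaluation` keyed into the reward mapping. Example values come from the zip-code test fixture removed later in this diff; `grade()` calls an eval LLM, so it needs model credentials.

```python
# Hedged sketch mirroring the step() change above; example values from the old test fixture.
import asyncio

from aviary.utils import MultipleChoiceQuestion

from paperqa.litqa import DEFAULT_REWARD_MAPPING

mc_question = MultipleChoiceQuestion(
    question="What is my office's zip code?",
    options=["-8", "94106", "cheesecake"],  # distractors only; aviary adds the ideal
    # answer and an "insufficient information" option when rendering the prompt
    ideal_answer="94107",
    shuffle_seed=42,  # deterministic option lettering, as in the new tests
)
print(mc_question.question_prompt)  # the text used as the session question


async def reward_for(proposed_answer: str) -> float:
    # Same call and unpacking as GradablePaperQAEnvironment.step() above
    evaluation, graded_answer = await mc_question.grade(proposed_answer)
    print(graded_answer)  # e.g. "94107" when the eval model extracts the right option
    return DEFAULT_REWARD_MAPPING[evaluation.value]  # 1.0 / 0.1 / -1.0


if __name__ == "__main__":
    # Requires eval-model credentials (e.g. OPENAI_API_KEY); expected reward: 1.0
    print(asyncio.run(reward_for("the answer is 94107")))
```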
+ """ + + def extract_question(x: GradablePaperQAEnvironment | Frame) -> str: + if isinstance(x, GradablePaperQAEnvironment): + query: str | MultipleChoiceQuestion | dict[str, Any] = x._query.query + else: + qr: QueryRequest | dict[str, Any] = x.info["query"] # type: ignore[call-overload,index,assignment] + query = qr.query if isinstance(qr, QueryRequest) else qr["query"] + if isinstance(query, str): + return query + if isinstance(query, MultipleChoiceQuestion): + return query.question_prompt + return query["question"] + + def extract_answer(x: GradablePaperQAEnvironment | Frame) -> str: + ses: PQASession | dict[str, Any] = ( + x.state.session + if isinstance(x.state, EnvironmentState) + else cast(PQASession | dict[str, Any], x.state["session"]) # type: ignore[call-overload,index] + ) + graded_answer = ( + ses.graded_answer if isinstance(ses, PQASession) else ses["graded_answer"] + ) + # One can filter the below empty string injection via the exclude_no_answer arg + return graded_answer or "" + + def extract_ideal(x: GradablePaperQAEnvironment | Frame) -> str: + if isinstance(x, GradablePaperQAEnvironment): + query: str | MultipleChoiceQuestion | dict[str, Any] = x._query.query + else: + qr: QueryRequest | dict[str, Any] = x.info["query"] # type: ignore[call-overload,index,assignment] + query = qr.query if isinstance(qr, QueryRequest) else qr["query"] + if isinstance(query, str): + raise ValueError( # noqa: TRY004 + f"We require a {MultipleChoiceQuestion.__name__} variant to extract" + " ideal answer, not a string." + ) + if isinstance(query, MultipleChoiceQuestion): + return query.ideal_answer + return query["ideal_answer"] + + try: + consensus, accuracy = await evaluate_consensus( + data=data, + grouping_fn=extract_question, + extract_answer_fn=extract_answer, + ideal_answer_fn=extract_ideal, + num_samples=num_samples, + seed=seed, + ) + except TypeError: + raise ImportError( + "Evaluating consensus requires the 'ldp' extra for 'ldp'. Please:" + " `pip install paper-qa[ldp]`." + ) from None + if exclude_no_answer: + consensus = { + q: [(a, c) for a, c in answers if a] for q, answers in consensus.items() + } + return consensus, accuracy + + +class StoreForConsensusSamplingCallback(Callback): + """Store environments or frames for later consensus sampling.""" + + def __init__(self): + super().__init__() + self.stored: list[GradablePaperQAEnvironment | Frame] = [] + + async def after_transition( + self, + traj_id: str, # noqa: ARG002 + agent: "Agent", # noqa: ARG002 + env: Environment, + transition: "Transition", + ) -> None: + if not isinstance(env, GradablePaperQAEnvironment): + raise NotImplementedError( + f"So far only handled {GradablePaperQAEnvironment} in this callback," + f" not {type(env)}." 
+ ) + if transition.done and not transition.failed: # Only store once + return + self.stored.append(env.export_frame()) + + async def evaluate_consensus_sampling( + self, num_samples: int = 1, seed: int | None = None + ) -> tuple[dict[str, list[tuple[str, int]]], float]: + return await evaluate_consensus_sampling( + data=self.stored, num_samples=num_samples, seed=seed + ) + + class LitQATaskDataset( TaskDataset[GradablePaperQAEnvironment], ComputeTrajectoryMetricsMixin, ABC ): @@ -218,24 +326,26 @@ def __init__( def _make_gradable_environment( self, - ideal: str, + ideal_answer: str, distractors: str | list[str], question: str, sources: str | list[str] | None = None, ) -> GradablePaperQAEnvironment: - qa_prompt, evaluation_from_answer = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, + mc_question = MultipleChoiceQuestion( question=question, - eval_model=self._eval_model, + options=( + distractors + if isinstance(distractors, list) + else MultipleChoiceQuestion.split_options(distractors) + ), + ideal_answer=ideal_answer, **(self._question_kwargs or {}), ) query = self._base_query.model_copy() - query.query = qa_prompt + query.query = mc_question return GradablePaperQAEnvironment( query=query, docs=self._base_docs.model_copy(), - evaluation_from_answer=evaluation_from_answer, sources=sources, rewards=self._rewards, **self._env_kwargs, @@ -338,7 +448,7 @@ def get_new_env_by_idx(self, idx: int) -> GradablePaperQAEnvironment: ) from exc sources.append(doi) return self._make_gradable_environment( - ideal=self.data.iloc[idx].ideal, + ideal_answer=self.data.iloc[idx].ideal, distractors=self.data.iloc[idx].distractors, question=self.data.iloc[idx].question, sources=sources, diff --git a/paperqa/litqa.py b/paperqa/litqa.py index 2f5607af..878fe971 100644 --- a/paperqa/litqa.py +++ b/paperqa/litqa.py @@ -2,284 +2,38 @@ from __future__ import annotations -import random -import re -import string -from ast import literal_eval -from collections.abc import Awaitable, Callable, Mapping, Sequence -from enum import StrEnum -from typing import TYPE_CHECKING, Literal, Self +from collections.abc import Mapping +from typing import TYPE_CHECKING -from aviary.core import Message -from llmclient import LiteLLMModel, LLMModel +from aviary.utils import MultipleChoiceEvaluation from paperqa._ldp_shims import discounted_returns -from paperqa.prompts import EVAL_PROMPT_TEMPLATE, QA_PROMPT_TEMPLATE -from paperqa.settings import make_default_litellm_model_list_settings -from paperqa.types import PQASession if TYPE_CHECKING: import pandas as pd -# Special case for LitQA, when ideal == "null" -UNSURE_OPTION = "Insufficient information to answer this question" -_CAPITAL_A_INDEX = ord("A") - -def make_mc_options( - ideal: str, - distractors: str | Sequence[str], - unsure_option: str | None = UNSURE_OPTION, - seed: int | None = None, -) -> tuple[str, str, str | None, list[str]]: - r""" - Return string of options (as letters) and correct answer. - - Examples: - >>> text, ideal_answer, unsure_answer, distractor_answers = make_mc_options( - ... ideal="1", distractors=["0", "2", "Dog"], seed=0 - ... 
) - >>> text - 'A) Dog\nB) 2\nC) 0\nD) Insufficient information to answer this question\nE) 1' - >>> ideal_answer - 'E' - >>> unsure_answer - 'D' - >>> distractor_answers - ['C', 'B', 'A'] - """ - if isinstance(distractors, str): - try: - split_distractors = literal_eval(distractors) - if not isinstance(split_distractors, list): - raise TypeError("Need split_distractors to be a list.") # noqa: TRY301 - except (ValueError, SyntaxError, TypeError): - split_distractors = [d.strip("'[ ]\"") for d in distractors.split(",")] - distractors = split_distractors - # We are going to modify options in-place, so copy the distractors - options = [*distractors] - - if ideal == "null": - if not unsure_option: - raise ValueError( - 'Dataset configured for "unsure" options via ' - 'ideal="null", please specify "unsure_option".' - ) - correct_answer = unsure_option - else: - # add the answer to the options, only if not null - options.append(ideal) - correct_answer = ideal - - if unsure_option: - options.append(unsure_option) - - if len(options) > len(string.ascii_lowercase): - raise NotImplementedError( - "Didn't handle more multiple choice options than letters, options were" - f" {options}." - ) - random.Random(seed).shuffle(options) - return ( - "\n".join([f"{_CAPITAL_A_INDEX + i:c}) {o}" for i, o in enumerate(options)]), - chr(_CAPITAL_A_INDEX + options.index(correct_answer)), - chr(_CAPITAL_A_INDEX + options.index(unsure_option)) if unsure_option else None, - [chr(_CAPITAL_A_INDEX + options.index(dstr)) for dstr in distractors], - ) - - -DEFAULT_EVAL_MODEL_NAME = "gpt-4-turbo-2024-04-09" DEFAULT_REWARD_MAPPING = {"correct": 1.0, "unsure": 0.1, "incorrect": -1.0} -SEED_USING_QUESTION: Literal["SEED_USING_QUESTION"] = "SEED_USING_QUESTION" # Sentinel - - -class LitQAEvaluation(StrEnum): - """Possible evaluation results for a LitQA question and methods for working with answers.""" - - CORRECT = "correct" - INCORRECT = "incorrect" - UNSURE = "unsure" - - @property - def answer(self) -> str | None: - return getattr(self, "_answer", None) - - @answer.setter - def answer(self, value: str | None) -> None: - self._answer = value - - @property - def ideal(self) -> str | None: - return getattr(self, "_ideal", None) - @ideal.setter - def ideal(self, value: str) -> None: - self._ideal = value - def make_discounted_returns( - self, - num_steps: int, - rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING, - discount: float = 1.0, - ) -> list[float]: - try: - return discounted_returns( - # paper-qa has no intermediary rewards - [0] * (num_steps - 1) + [rewards[self.value]], - terminated=[False] * (num_steps - 1) + [True], - discount=discount, - ) - except TypeError as exc: - raise ImportError( - "Making discounted returns requires the 'ldp' extra for 'ldp'. Please:" - " `pip install paper-qa[ldp]`." 
- ) from exc - - @classmethod - def from_answer( - cls, - text: str, - ideal_mc_answer: str, - unsure_mc_answer: str | None = None, - total_options: int | None = None, - ) -> LitQAEvaluation: - """Compare text with a multiple choice answer or optionally an unsure answer.""" - - def extract_answer(answer: str) -> str: - # first capital letter, like A or A) - s = re.search(r"([A-Z])\)?", answer, re.DOTALL) - if s is not None: - return s.group(1) - return answer.split()[0][0].upper() - - result = extract_answer(text) - if ( - total_options is not None - and ord(result[0]) - _CAPITAL_A_INDEX + 1 > total_options - ): - # The result extracted was not in the options - evaluation = cls.INCORRECT - evaluation.answer = result - # From here, if we don't match either the ideal or the unsure multiple choice - # options then we declare the answer as incorrect. - elif unsure_mc_answer and result[0].lower() == unsure_mc_answer[0].lower(): - evaluation = cls.UNSURE - evaluation.answer = unsure_mc_answer - elif result[0].lower() == ideal_mc_answer[0].lower(): - evaluation = cls.CORRECT - evaluation.answer = ideal_mc_answer - else: - evaluation = cls.INCORRECT - evaluation.answer = result - evaluation.ideal = ideal_mc_answer - return evaluation - - @classmethod - def from_question( - cls, - ideal: str, - distractors: str | list[str], - question: str, - use_unsure: bool = True, - eval_model: LLMModel | str = DEFAULT_EVAL_MODEL_NAME, - seed: int | Literal["SEED_USING_QUESTION"] | None = None, - ) -> tuple[str, Callable[[PQASession | str], Awaitable[LitQAEvaluation]]]: - """ - Create a LitQA question and an answer-to-evaluation function. - - Args: - ideal: Ideal answer term's text (not a multiple choice letter). - distractors: Distractor terms' text (not multiple choice letters). - question: Question text. - use_unsure: Flag (default is enabled) to add an 'insufficient answer' term. - eval_model: Evaluation model to use for multiple choice letter extraction - from a text answer. - seed: Optional seed to use in randomization of multiple choice letters. - Optionally pass in the string literal "SEED_USING_QUESTION" to hash the - input question for the seed. - - Returns: - Two-tuple of created LitQA question, function (that can be thought of as - stateless) to use to extract an evaluation result from an answer. - """ - if seed == SEED_USING_QUESTION: - seed = hash(question) - text, ideal_answer, unsure_answer, distractor_answers = make_mc_options( - ideal=ideal, - distractors=distractors, - seed=seed, - **({} if use_unsure else {"unsure_option": None}), - ) - qa_prompt = QA_PROMPT_TEMPLATE.format(question=question, options=text) - - if isinstance(eval_model, str): - eval_model = LiteLLMModel( - name=eval_model, - config=make_default_litellm_model_list_settings(eval_model), - ) - - async def llm_from_answer(answer: PQASession | str) -> LitQAEvaluation: - if isinstance(answer, PQASession): - answer = answer.answer - eval_chunk = await eval_model.achat( - messages=[ - Message( - role="user", - content=EVAL_PROMPT_TEMPLATE.format( - qa_prompt=qa_prompt, qa_answer=answer - ), - ), - ] - ) - if not isinstance(eval_chunk.text, str): - raise NotImplementedError( - f"Expected evaluation chunk to be a string, not {eval_chunk.text}." 
- ) - evaluation = cls.from_answer( - text=eval_chunk.text, - ideal_mc_answer=ideal_answer, - unsure_mc_answer=unsure_answer, - total_options=len(distractor_answers) + (2 if use_unsure else 1), - ) - # convert MC answers back to full text option so that it - # is meaningful - evaluation.ideal = ideal - if evaluation == cls.CORRECT: - evaluation.answer = ideal - elif evaluation == cls.UNSURE: - evaluation.answer = UNSURE_OPTION - else: - try: - evaluation.answer = distractors[ - distractor_answers.index(evaluation.answer or "") - ] - except ValueError: - evaluation.answer = None - return evaluation - - return qa_prompt, llm_from_answer - - @classmethod - def calculate_accuracy_precision( - cls, evaluations: Sequence[Self | str] - ) -> tuple[float, float]: - """ - Calculate LitQA-specific accuracy and precision metrics upon evaluations. - - Raises: - ZeroDivisionError: if an empty input. - - Returns: - Two-tuple of accuracy = (num correct) / (num questions) and - precision = (num correct) / ((num questions) - (num unsure)). - """ - evaluations = [e if isinstance(e, cls) else cls(e) for e in evaluations] - num_correct = sum(e == cls.CORRECT for e in evaluations) - accuracy = num_correct / len(evaluations) - precision = num_correct / sum( - e in {cls.CORRECT, cls.INCORRECT} for e in evaluations +def make_discounted_returns( + evaluation: MultipleChoiceEvaluation, + num_steps: int, + rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING, + discount: float = 1.0, +) -> list[float]: + try: + return discounted_returns( + # paper-qa has no intermediary rewards + [0] * (num_steps - 1) + [rewards[evaluation.value]], + terminated=[False] * (num_steps - 1) + [True], + discount=discount, ) - return accuracy, precision + except TypeError as exc: + raise ImportError( + "Making discounted returns requires the 'ldp' extra for 'ldp'. Please:" + " `pip install paper-qa[ldp]`." + ) from exc DEFAULT_LABBENCH_HF_HUB_NAME = "futurehouse/lab-bench" diff --git a/paperqa/types.py b/paperqa/types.py index 007caf88..db932b08 100644 --- a/paperqa/types.py +++ b/paperqa/types.py @@ -132,7 +132,19 @@ class PQASession(BaseModel): context: str = "" contexts: list[Context] = Field(default_factory=list) references: str = "" - formatted_answer: str = "" + formatted_answer: str = Field( + default="", + description=( + "Optional prettified answer that includes information like question and" + " citations." + ), + ) + graded_answer: str | None = Field( + default=None, + description=( + "Optional graded answer, used for things like multiple choice questions." 
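A worked example (not part of this patch) for the new module-level `make_discounted_returns` above, mirroring the parametrization of the rewritten `test_make_discounted_returns` later in this diff. It needs the `ldp` extra installed, since the helper delegates to `ldp.utils.discounted_returns`.

```python
# Hedged sketch: expected values match the new test parametrization in tests/test_litqa.py.
from aviary.utils import MultipleChoiceEvaluation

from paperqa.litqa import make_discounted_returns

# A 3-step trajectory graded "correct" gets rewards [0, 0, 1.0] (terminal step only),
# so with discount 0.5 the returns are [1.0 * 0.5**2, 1.0 * 0.5, 1.0].
assert make_discounted_returns(
    MultipleChoiceEvaluation.CORRECT, num_steps=3, discount=0.5
) == [0.25, 0.5, 1.0]

# An "unsure" grade maps to reward 0.1 via DEFAULT_REWARD_MAPPING.
assert make_discounted_returns(
    MultipleChoiceEvaluation.UNSURE, num_steps=3, discount=0.5
) == [0.025, 0.05, 0.1]
```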
+ ), + ) cost: float = 0.0 # Map model name to a two-item list of LLM prompt token counts # and LLM completion token counts diff --git a/pyproject.toml b/pyproject.toml index 4d1fb2de..e900cb06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "aiohttp>=3.10.6", # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError "anyio", "fh-llm-client", - "fhaviary[llm]>=0.10.2", # For tool execution concurrency + "fhaviary[llm]>=0.14", # For MultipleChoiceQuestion "html2text", # TODO: evaluate moving to an opt-in dependency "httpx", "numpy", @@ -77,7 +77,7 @@ dev = [ "vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884 ] ldp = [ - "ldp>=0.16.0", # For Callback.before_rollout + "ldp>=0.17", # For evaluate_consensus ] local = [ "sentence-transformers", diff --git a/tests/test_litqa.py b/tests/test_litqa.py index 18e03f48..0a36cefc 100644 --- a/tests/test_litqa.py +++ b/tests/test_litqa.py @@ -1,249 +1,56 @@ -from collections.abc import Sequence from typing import cast import pytest +from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion -from paperqa.litqa import ( - SEED_USING_QUESTION, - UNSURE_OPTION, - LitQAEvaluation, - read_litqa_v2_from_hub, -) -from tests.conftest import VCR_DEFAULT_MATCH_ON - +from paperqa.litqa import make_discounted_returns, read_litqa_v2_from_hub -class TestLitQAEvaluation: - @staticmethod - def _assert_prompt_is_valid( - qa_prompt: str, question: str, ideal: str, distractors: Sequence[str] - ) -> None: - for substr in (question, "Insufficient information", ideal, *distractors): - assert qa_prompt.count(substr) == 1 - # Use for general purpose testing - ZIP_CODE_QUESTION_IDEAL_DISTRACTORS = ( - "What is my office's zip code?", - "94107", - ["-8", "94106", "cheesecake"], - ) - # The following two are used to check we don't leak on the LLM's innate knowledge - MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS = ( - "What is the meaning of life?", - "42", - ["-84", "11", "cheesecake"], +@pytest.mark.parametrize( + ("evaluation", "expected_dreturns"), + [ + (MultipleChoiceEvaluation.CORRECT, [0.25, 0.5, 1.0]), + (MultipleChoiceEvaluation.INCORRECT, [-0.25, -0.5, -1.0]), + (MultipleChoiceEvaluation.UNSURE, [0.025, 0.05, 0.1]), + ], +) +def test_make_discounted_returns( + evaluation: MultipleChoiceEvaluation, expected_dreturns: list[float] +) -> None: + assert ( + make_discounted_returns(evaluation, num_steps=3, discount=0.5) + == expected_dreturns ) - # Source: https://github.com/Future-House/LAB-Bench/blob/43b2045c67a2da12c233689cf538f1ed5c42f590/LitQA2/litqa-v2-public.jsonl#L130 - LITQA2_QUESTION_IDEAL_DISTRACTORS = ( + + +def test_creating_litqa_questions() -> None: + """Test making LitQA eval questions after downloading from Hugging Face Hub.""" + _, eval_split = read_litqa_v2_from_hub(seed=42) + assert len(eval_split) > 3 + assert [ + MultipleChoiceQuestion( + question=cast(str, row.question), + options=cast(list[str], row.distractors), + ideal_answer=cast(str, row.ideal), + shuffle_seed=42, + ).question_prompt + for row in eval_split[:3].itertuples() + ] == [ ( - "What method was used to demonstrate that the enzyme PafA is stable after" - " incubation with 4M urea for 14 days?" 
+ "Q: Which of the following mutations in yeast Pbs2 increases its" + " interaction with SH3?\n\nOptions:\nA) P97A\nB) N92S\nC) Insufficient" + " information to answer this question\nD) K85W\nE) N92H\nF) I87W\nG) S83F" ), - "circular dichroism", - ["cryo EM", "x-ray crystallography", "NMR"], - ) - - @pytest.mark.asyncio - @pytest.mark.vcr(match_on=[*VCR_DEFAULT_MATCH_ON, "body"]) - @pytest.mark.parametrize( ( - "question", - "ideal", - "distractors", - "answer", - "expected_eval", - "expected_dreturns", - "extracted_answer", + "Q: What percentage of colorectal cancer-associated fibroblasts typically" + " survive at 2 weeks if cultured with the platinum-based chemotherapy" + " oxaliplatin?\n\nOptions:\nA) Insufficient information to answer this" + " question\nB) 0%\nC) 50-80%\nD) 20-50%\nE) 1-20%\nF) 80-99%" ), - [ - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 94107", - LitQAEvaluation.CORRECT, - [0.25, 0.5, 1.0], - "94107", - id="matched-correct-option", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 14004", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="didnt-match-and-no-llm-innate-knowledge", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 94106", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - "94106", - id="matched-incorrect-option", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "Insufficient information", - LitQAEvaluation.UNSURE, - [0.025, 0.05, 0.1], - UNSURE_OPTION, - id="matched-unsure-option", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 94106 or 94107", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="matched-several-options", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="empty-answer1", - ), - pytest.param( - *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, - "14", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="didnt-match-and-llm-has-innate-knowledge", - ), - pytest.param( - *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, - "", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="empty-answer2", - ), - pytest.param( - *LITQA2_QUESTION_IDEAL_DISTRACTORS, - "", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="empty-answer3", - ), - ], - ) - async def test_from_question( - self, - question: str, - ideal: str, - distractors: str | list[str], - answer: str, - expected_eval: LitQAEvaluation, - expected_dreturns: list[float], - extracted_answer: str, - ) -> None: - """Tests that we can create a LitQA question and evaluate answers.""" - qa_prompt, eval_fn = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, - question=question, - seed=42, # Seed for VCR cassette - ) - self._assert_prompt_is_valid(qa_prompt, question, ideal, distractors) - - evaluation = await eval_fn(answer) - assert evaluation == expected_eval - if evaluation == LitQAEvaluation.CORRECT: - assert evaluation.answer == ideal - assert evaluation.answer == extracted_answer - assert evaluation.ideal == ideal - assert evaluation.make_discounted_returns(3, discount=0.5) == expected_dreturns - - def test_consistent_mc_options(self) -> None: - """Tests that creating multiple evaluations with the same seed results in the same prompt.""" - question, ideal, distractors = self.MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS - - qa_prompt_1a, _ = LitQAEvaluation.from_question( - ideal=ideal, 
distractors=distractors, question=question, seed=0 - ) - self._assert_prompt_is_valid(qa_prompt_1a, question, ideal, distractors) - - qa_prompt_1b, _ = LitQAEvaluation.from_question( - ideal=ideal, distractors=distractors, question=question, seed=0 - ) - self._assert_prompt_is_valid(qa_prompt_1b, question, ideal, distractors) - assert qa_prompt_1a == qa_prompt_1b, "Same seeding should lead to same prompts" - - qa_prompt_2a, _ = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, - question=question, - seed=SEED_USING_QUESTION, - ) - self._assert_prompt_is_valid(qa_prompt_2a, question, ideal, distractors) - - qa_prompt_2b, _ = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, - question=question, - seed=SEED_USING_QUESTION, - ) - self._assert_prompt_is_valid(qa_prompt_2b, question, ideal, distractors) - assert ( - qa_prompt_2a == qa_prompt_2b - ), "Same seeding strategy should lead to same prompts" - assert ( - qa_prompt_2a != qa_prompt_1a - ), "Different seeding strategies should lead to different prompts" - - def test_creating_litqa_questions(self) -> None: - """Test making LitQA eval questions after downloading from Hugging Face Hub.""" - _, eval_split = read_litqa_v2_from_hub(seed=42) - assert len(eval_split) > 3 - assert [ - LitQAEvaluation.from_question( - ideal=cast(str, row.ideal), - distractors=cast(list[str], row.distractors), - question=cast(str, row.question), - seed=42, - )[0] - for row in eval_split[:3].itertuples() - ] == [ - ( - "Q: Which of the following mutations in yeast Pbs2 increases its" - " interaction with SH3?\n\nOptions:\nA) S83F\nB) I87W\nC) N92H\nD) K85W\nE)" - " Insufficient information to answer this question\nF) N92S\nG) P97A" - ), - ( - "Q: What percentage of colorectal cancer-associated fibroblasts typically" - " survive at 2 weeks if cultured with the platinum-based chemotherapy" - " oxaliplatin?\n\nOptions:\nA) 80-99%\nB) 1-20%\nC) 20-50%\nD) 50-80%\nE)" - " 0%\nF) Insufficient information to answer this question" - ), - ( - "Q: Which of the following genes shows the greatest difference in gene" - " expression between homologous cell types in mouse and human" - " brain?\n\nOptions:\nA) Htr3a\nB) Htr5a\nC) Htr6\nD) Insufficient" - " information to answer this question\nE) Htr1d" - ), - ] - - @pytest.mark.parametrize( - ("evals", "accuracy_precision"), - [ - ( - [ - LitQAEvaluation.CORRECT, - LitQAEvaluation.CORRECT, - LitQAEvaluation.CORRECT, - ], - (1, 1), - ), - (["correct", "correct", "unsure"], (2 / 3, 1)), - ( - [LitQAEvaluation.CORRECT, LitQAEvaluation.UNSURE, "incorrect"], - (1 / 3, 1 / 2), - ), - ], - ) - def test_calculate_accuracy_precision( - self, evals: Sequence[LitQAEvaluation], accuracy_precision: tuple[float, float] - ) -> None: - assert LitQAEvaluation.calculate_accuracy_precision(evals) == accuracy_precision + ( + "Q: Which of the following genes shows the greatest difference in gene" + " expression between homologous cell types in mouse and human" + " brain?\n\nOptions:\nA) Htr1d\nB) Insufficient information to answer this" + " question\nC) Htr6\nD) Htr5a\nE) Htr3a" + ), + ] diff --git a/tests/test_task.py b/tests/test_task.py index e1f14d83..dfe7a452 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -5,6 +5,7 @@ import pytest from aviary.env import TASK_DATASET_REGISTRY, TaskConfig, TaskDataset +from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion from ldp.agent import SimpleAgent from ldp.alg.callbacks import Callback, MeanMetricsCallback, 
StoreTrajectoriesCallback from ldp.alg.runners import Evaluator, EvaluatorConfig @@ -20,7 +21,7 @@ LitQAv2TaskSplit, ) from paperqa.agents.tools import GenerateAnswer -from paperqa.litqa import DEFAULT_REWARD_MAPPING, SEED_USING_QUESTION, LitQAEvaluation +from paperqa.litqa import DEFAULT_REWARD_MAPPING @pytest.fixture(name="base_query_request") @@ -60,7 +61,7 @@ def __init__(self, *args, **kwargs): def get_new_env_by_idx(self, idx: int) -> GradablePaperQAEnvironment: return self._make_gradable_environment( - ideal=self.data[idx][0], + ideal_answer=self.data[idx][0], distractors=self.data[idx][1], question=self.data[idx][2], sources=self.data[idx][3], @@ -87,7 +88,10 @@ def __init__(self): self.query_to_envs: dict[str, PaperQAEnvironment] = {} async def before_rollout(self, traj_id: str, env) -> None: # noqa: ARG002 - self.query_to_envs[env._query.query] = env + query: str | MultipleChoiceQuestion = env._query.query + self.query_to_envs[ + query if isinstance(query, str) else query.question_prompt + ] = env class TestTaskDataset: @@ -120,9 +124,9 @@ async def test___len__( obs, _ = await env.reset() assert ( "Q: SLC14A1 been identified as a specific marker for endothelial" - " cells in which organ?\n\nOptions:\nA) heart\nB) eye\nC)" - " prostate\nD) Insufficient information to answer this question\nE)" - " liver" in (obs[0].content or "") + " cells in which organ?\n\nOptions:\nA) liver\nB) eye\nC)" + " prostate\nD) heart\nE) Insufficient information to answer this" + " question" in (obs[0].content or "") ) assert env.sources, "Sources need to be accessible" assert isinstance( @@ -159,7 +163,7 @@ async def test_evaluation( "deleted_dockeys", } ), - "question_kwargs": {"seed": SEED_USING_QUESTION}, + "question_kwargs": {"seed": MultipleChoiceQuestion.SEED_USING_QUESTION}, }, ) # NOTE: set base_query after construction of the TaskConfig. because in @@ -193,7 +197,10 @@ async def test_evaluation( assert metrics_callback.eval_means["reward"] > 0, "Expected some wins" correct_reward, incorrect_reward = ( DEFAULT_REWARD_MAPPING[evaluation.value] - for evaluation in (LitQAEvaluation.CORRECT, LitQAEvaluation.INCORRECT) + for evaluation in ( + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.INCORRECT, + ) ) worst_case_reward_given_correct = ( correct_reward * correct_percentage diff --git a/uv.lock b/uv.lock index 65519dd6..150522e2 100644 --- a/uv.lock +++ b/uv.lock @@ -240,7 +240,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -1643,7 +1643,7 @@ wheels = [ [[package]] name = "paper-qa" -version = "5.8.1.dev14+g919bf0c" +version = "5.8.1.dev15+gf68b03e.d20250102" source = { editable = "." 
} dependencies = [ { name = "aiohttp" }, @@ -1715,7 +1715,31 @@ zotero = [ [package.dev-dependencies] dev = [ - { name = "paper-qa", extra = ["dev"] }, + { name = "datasets" }, + { name = "ipython" }, + { name = "ldp" }, + { name = "mypy" }, + { name = "pandas-stubs" }, + { name = "pre-commit" }, + { name = "pydantic" }, + { name = "pylint-pydantic" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-recording" }, + { name = "pytest-rerunfailures" }, + { name = "pytest-subtests" }, + { name = "pytest-sugar" }, + { name = "pytest-timer", extra = ["colorama"] }, + { name = "pytest-xdist" }, + { name = "python-dotenv" }, + { name = "pyzotero" }, + { name = "qdrant-client" }, + { name = "refurb" }, + { name = "sentence-transformers" }, + { name = "typeguard" }, + { name = "types-pyyaml" }, + { name = "types-setuptools" }, + { name = "vcrpy" }, ] [package.metadata] @@ -1724,11 +1748,11 @@ requires-dist = [ { name = "anyio" }, { name = "datasets", marker = "extra == 'datasets'" }, { name = "fh-llm-client" }, - { name = "fhaviary", extras = ["llm"], specifier = ">=0.10.2" }, + { name = "fhaviary", extras = ["llm"], specifier = ">=0.14" }, { name = "html2text" }, { name = "httpx" }, { name = "ipython", marker = "extra == 'dev'", specifier = ">=8" }, - { name = "ldp", marker = "extra == 'ldp'", specifier = ">=0.16.0" }, + { name = "ldp", marker = "extra == 'ldp'", specifier = ">=0.17" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.8" }, { name = "numpy" }, { name = "pandas-stubs", marker = "extra == 'typing'" }, @@ -1860,7 +1884,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -2170,7 +2194,7 @@ name = "pympler" version = "1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dd/37/c384631908029676d8e7213dd956bb686af303a80db7afbc9be36bc49495/pympler-1.1.tar.gz", hash = "sha256:1eaa867cb8992c218430f1708fdaccda53df064144d1c5656b1e6f1ee6000424", size = 179954 } wheels = [ @@ -2935,21 +2959,21 @@ dependencies = [ { name = "fsspec" }, { name = "jinja2" }, { name = "networkx" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = 
"nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux'" }, { name = "typing-extensions" }, ] wheels = [ @@ -2969,7 +2993,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [