diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b91cacef..84fe21b3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -84,8 +84,8 @@ repos: - aiohttp>=3.10.6 # Match pyproject.toml - PyMuPDF>=1.24.12 - anyio - - fhaviary[llm]>=0.10.2 # Match pyproject.toml - - ldp>=0.14.5 # Match pyproject.toml + - fhaviary[llm]>=0.14 # Match pyproject.toml + - ldp>=0.17 # Match pyproject.toml - html2text - fh-llm-client - httpx diff --git a/paperqa/_ldp_shims.py b/paperqa/_ldp_shims.py index 744e2d8a..02dfb13a 100644 --- a/paperqa/_ldp_shims.py +++ b/paperqa/_ldp_shims.py @@ -15,6 +15,7 @@ "UIndexMemoryModel", "_Memories", "discounted_returns", + "evaluate_consensus", "set_training_mode", ] @@ -29,7 +30,12 @@ SimpleAgent, SimpleAgentState, ) - from ldp.alg import Callback, ComputeTrajectoryMetricsMixin, RolloutManager + from ldp.alg import ( + Callback, + ComputeTrajectoryMetricsMixin, + RolloutManager, + evaluate_consensus, + ) from ldp.graph.memory import Memory, UIndexMemoryModel from ldp.graph.op_utils import set_training_mode from ldp.utils import discounted_returns @@ -57,4 +63,5 @@ class Callback: # type: ignore[no-redef] SimpleAgentState = None # type: ignore[assignment,misc] UIndexMemoryModel = None # type: ignore[assignment,misc] discounted_returns = None # type: ignore[assignment] + evaluate_consensus = None # type: ignore[assignment] set_training_mode = None # type: ignore[assignment] diff --git a/paperqa/agents/env.py b/paperqa/agents/env.py index 7971bc86..fbdaf704 100644 --- a/paperqa/agents/env.py +++ b/paperqa/agents/env.py @@ -11,6 +11,7 @@ ToolRequestMessage, ToolResponseMessage, ) +from aviary.utils import MultipleChoiceQuestion from llmclient import EmbeddingModel, LiteLLMModel from paperqa.docs import Docs @@ -238,10 +239,11 @@ def make_initial_state(self) -> EnvironmentState: ): status_fn = clinical_trial_status + query: str | MultipleChoiceQuestion = self._query.query return EnvironmentState( docs=self._docs, session=PQASession( - question=self._query.query, + question=query if isinstance(query, str) else query.question_prompt, config_md5=self._query.settings.md5, id=self._query.id, ), diff --git a/paperqa/agents/models.py b/paperqa/agents/models.py index eaf366d6..906e8058 100644 --- a/paperqa/agents/models.py +++ b/paperqa/agents/models.py @@ -8,6 +8,7 @@ from typing import Any, ClassVar, Protocol from uuid import UUID, uuid4 +from aviary.utils import MultipleChoiceQuestion from llmclient import LiteLLMModel, LLMModel from pydantic import ( BaseModel, @@ -55,7 +56,13 @@ class MismatchedModelsError(Exception): class QueryRequest(BaseModel): model_config = ConfigDict(extra="forbid") - query: str = "" + query: str | MultipleChoiceQuestion = Field( + default="", + description=( + "The query to be answered. Set to a multiple choice question when grading" + " (e.g. for training)." 
+ ), + ) id: UUID = Field( default_factory=uuid4, description="Identifier which will be propagated to the Answer object.", diff --git a/paperqa/agents/task.py b/paperqa/agents/task.py index 96220146..0ae55a64 100644 --- a/paperqa/agents/task.py +++ b/paperqa/agents/task.py @@ -10,13 +10,14 @@ import logging import re from abc import ABC -from collections.abc import Awaitable, Callable, Mapping, Sequence +from collections.abc import Awaitable, Callable, Iterable, Mapping, Sequence from copy import deepcopy from enum import StrEnum -from typing import TYPE_CHECKING, Any, Self, assert_never +from typing import TYPE_CHECKING, Any, Self, assert_never, cast from aviary.core import ( TASK_DATASET_REGISTRY, + Environment, Frame, Messages, TaskDataset, @@ -24,15 +25,22 @@ ToolResponseMessage, ) from aviary.env import ENV_REGISTRY +from aviary.utils import ( + DEFAULT_EVAL_MODEL_NAME, + MultipleChoiceEvaluation, + MultipleChoiceQuestion, +) from llmclient import EmbeddingModel, LiteLLMModel, LLMModel -from paperqa._ldp_shims import ComputeTrajectoryMetricsMixin +from paperqa._ldp_shims import ( + Callback, + ComputeTrajectoryMetricsMixin, + evaluate_consensus, +) from paperqa.docs import Docs from paperqa.litqa import ( - DEFAULT_EVAL_MODEL_NAME, DEFAULT_LABBENCH_HF_HUB_NAME, DEFAULT_REWARD_MAPPING, - LitQAEvaluation, read_litqa_v2_from_hub, ) from paperqa.types import DocDetails, PQASession @@ -40,10 +48,11 @@ from .env import POPULATE_FROM_SETTINGS, PaperQAEnvironment from .models import QueryRequest from .search import SearchIndex, maybe_get_manifest -from .tools import Complete +from .tools import Complete, EnvironmentState if TYPE_CHECKING: - from ldp.data_structures import Trajectory + from ldp.agent import Agent + from ldp.data_structures import Trajectory, Transition logger = logging.getLogger(__name__) @@ -58,26 +67,22 @@ def __init__( llm_model: LiteLLMModel | None = POPULATE_FROM_SETTINGS, summary_llm_model: LiteLLMModel | None = POPULATE_FROM_SETTINGS, embedding_model: EmbeddingModel | None = POPULATE_FROM_SETTINGS, - evaluation_from_answer: ( - Callable[[PQASession | str], Awaitable[LitQAEvaluation]] | None - ) = None, sources: str | list[str] | None = None, rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING, - evaluation_callback: Callable[[LitQAEvaluation], Awaitable] | None = None, + evaluation_callback: ( + Callable[[MultipleChoiceEvaluation], Awaitable] | None + ) = None, **env_kwargs, ): super().__init__( query, docs, llm_model, summary_llm_model, embedding_model, **env_kwargs ) - self._evaluation_from_answer = evaluation_from_answer # Enables checking an Index has the right DOI(s) self.sources: list[str] | None = ( [sources] if isinstance(sources, str) else sources ) self._evaluation_callback = evaluation_callback self._rewards = rewards - self.answer = "" - self.ideal = "" async def validate_sources( self, manifest_or_index: dict[str, DocDetails] | SearchIndex | None = None @@ -120,7 +125,7 @@ async def step( self, action: ToolRequestMessage ) -> tuple[Messages, float, bool, bool]: messages, reward, done, truncated = await super().step(action) - if not done or not self._evaluation_from_answer: + if not done or not isinstance(self._query.query, MultipleChoiceQuestion): return messages, reward, done, truncated # If the ensuring evaluation fails (e.g. due to OpenAI being down), we can: # - Suppress the exception and declare the evaluation as incorrect, which can @@ -130,23 +135,13 @@ async def step( # incorrectly reward what otherwise was a good trajectory. 
# - Don't suppress the exception, which leads to the trajectory failing, and # removes it from the learnable pool. This is the only safe default behavior. - evaluation = await self._evaluation_from_answer(self.state.session.answer) + evaluation, self.state.session.graded_answer = await self._query.query.grade( + self.state.session.answer + ) if evaluation_callback := self._evaluation_callback: await evaluation_callback(evaluation) - self.answer = evaluation.answer or "" - self.ideal = evaluation.ideal or "" return messages, reward + self._rewards[evaluation.value], done, truncated - def export_frame(self) -> Frame: - return Frame( - state=self.state, - info={ - "query": self._query, - "answer": self.answer, - "ideal": self.ideal, - }, - ) - def __deepcopy__(self, memo) -> Self: copy_state = deepcopy(self.state, memo) # We don't know the side effects of deep copying a litellm.Router, @@ -162,7 +157,6 @@ def __deepcopy__(self, memo) -> Self: copy_self = type(self)( query=deepcopy(self._query, memo), # deepcopy for _docs_name docs=copy_state.docs, - evaluation_from_answer=self._evaluation_from_answer, sources=self.sources, rewards=self._rewards, evaluation_callback=self._evaluation_callback, @@ -182,6 +176,120 @@ def __deepcopy__(self, memo) -> Self: ) +async def evaluate_consensus_sampling( + data: Iterable[GradablePaperQAEnvironment | Frame], + exclude_no_answer: bool = False, + num_samples: int = 1, + seed: int | None = None, +) -> tuple[dict[str, list[tuple[str, int]]], float]: + """ + Create consensus groups based on question and evaluate the consensus for each. + + Args: + data: Data to evaluate consensus upon, either gradable environments or frames. + exclude_no_answer: Opt-in flag to filter out empty answers (due to the + Environment/Frame not having a graded answer). Use of this flag does not + affect the accuracy term of the return. + num_samples: Passed through to evaluate_consensus. + seed: Passed through to evaluate_consensus. + + Returns: + Two-tuple of consensus list generated by collections.Counter.most_common (keys + are question, values are list of (answer, vote count)) and the proportion of + groups for which the consensus matches the ideal. 
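For context, a minimal sketch (not part of this patch) of the grading path the revised `step()` relies on: the query now carries an aviary `MultipleChoiceQuestion`, whose `question_prompt` becomes the session question and whose `grade()` maps a free-text answer to a `MultipleChoiceEvaluation` keyed into the reward mapping. Example values come from the zip-code test fixture removed later in this diff; `grade()` calls an eval LLM, so it needs model credentials.

```python
# Hedged sketch mirroring the step() change above; example values from the old test fixture.
import asyncio

from aviary.utils import MultipleChoiceQuestion

from paperqa.litqa import DEFAULT_REWARD_MAPPING

mc_question = MultipleChoiceQuestion(
    question="What is my office's zip code?",
    options=["-8", "94106", "cheesecake"],  # distractors only; aviary adds the ideal
    # answer and an "insufficient information" option when rendering the prompt
    ideal_answer="94107",
    shuffle_seed=42,  # deterministic option lettering, as in the new tests
)
print(mc_question.question_prompt)  # the text used as the session question


async def reward_for(proposed_answer: str) -> float:
    # Same call and unpacking as GradablePaperQAEnvironment.step() above
    evaluation, graded_answer = await mc_question.grade(proposed_answer)
    print(graded_answer)  # e.g. "94107" when the eval model extracts the right option
    return DEFAULT_REWARD_MAPPING[evaluation.value]  # 1.0 / 0.1 / -1.0


if __name__ == "__main__":
    # Requires eval-model credentials (e.g. OPENAI_API_KEY); expected reward: 1.0
    print(asyncio.run(reward_for("the answer is 94107")))
```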
+ """ + + def extract_question(x: GradablePaperQAEnvironment | Frame) -> str: + if isinstance(x, GradablePaperQAEnvironment): + query: str | MultipleChoiceQuestion | dict[str, Any] = x._query.query + else: + qr: QueryRequest | dict[str, Any] = x.info["query"] # type: ignore[call-overload,index,assignment] + query = qr.query if isinstance(qr, QueryRequest) else qr["query"] + if isinstance(query, str): + return query + if isinstance(query, MultipleChoiceQuestion): + return query.question_prompt + return query["question"] + + def extract_answer(x: GradablePaperQAEnvironment | Frame) -> str: + ses: PQASession | dict[str, Any] = ( + x.state.session + if isinstance(x.state, EnvironmentState) + else cast(PQASession | dict[str, Any], x.state["session"]) # type: ignore[call-overload,index] + ) + graded_answer = ( + ses.graded_answer if isinstance(ses, PQASession) else ses["graded_answer"] + ) + # One can filter the below empty string injection via the exclude_no_answer arg + return graded_answer or "" + + def extract_ideal(x: GradablePaperQAEnvironment | Frame) -> str: + if isinstance(x, GradablePaperQAEnvironment): + query: str | MultipleChoiceQuestion | dict[str, Any] = x._query.query + else: + qr: QueryRequest | dict[str, Any] = x.info["query"] # type: ignore[call-overload,index,assignment] + query = qr.query if isinstance(qr, QueryRequest) else qr["query"] + if isinstance(query, str): + raise ValueError( # noqa: TRY004 + f"We require a {MultipleChoiceQuestion.__name__} variant to extract" + " ideal answer, not a string." + ) + if isinstance(query, MultipleChoiceQuestion): + return query.ideal_answer + return query["ideal_answer"] + + try: + consensus, accuracy = await evaluate_consensus( + data=data, + grouping_fn=extract_question, + extract_answer_fn=extract_answer, + ideal_answer_fn=extract_ideal, + num_samples=num_samples, + seed=seed, + ) + except TypeError: + raise ImportError( + "Evaluating consensus requires the 'ldp' extra for 'ldp'. Please:" + " `pip install paper-qa[ldp]`." + ) from None + if exclude_no_answer: + consensus = { + q: [(a, c) for a, c in answers if a] for q, answers in consensus.items() + } + return consensus, accuracy + + +class StoreForConsensusSamplingCallback(Callback): + """Store environments or frames for later consensus sampling.""" + + def __init__(self): + super().__init__() + self.stored: list[GradablePaperQAEnvironment | Frame] = [] + + async def after_transition( + self, + traj_id: str, # noqa: ARG002 + agent: "Agent", # noqa: ARG002 + env: Environment, + transition: "Transition", + ) -> None: + if not isinstance(env, GradablePaperQAEnvironment): + raise NotImplementedError( + f"So far only handled {GradablePaperQAEnvironment} in this callback," + f" not {type(env)}." 
+ ) + if transition.done and not transition.failed: # Only store once + return + self.stored.append(env.export_frame()) + + async def evaluate_consensus_sampling( + self, num_samples: int = 1, seed: int | None = None + ) -> tuple[dict[str, list[tuple[str, int]]], float]: + return await evaluate_consensus_sampling( + data=self.stored, num_samples=num_samples, seed=seed + ) + + class LitQATaskDataset( TaskDataset[GradablePaperQAEnvironment], ComputeTrajectoryMetricsMixin, ABC ): @@ -218,24 +326,26 @@ def __init__( def _make_gradable_environment( self, - ideal: str, + ideal_answer: str, distractors: str | list[str], question: str, sources: str | list[str] | None = None, ) -> GradablePaperQAEnvironment: - qa_prompt, evaluation_from_answer = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, + mc_question = MultipleChoiceQuestion( question=question, - eval_model=self._eval_model, + options=( + distractors + if isinstance(distractors, list) + else MultipleChoiceQuestion.split_options(distractors) + ), + ideal_answer=ideal_answer, **(self._question_kwargs or {}), ) query = self._base_query.model_copy() - query.query = qa_prompt + query.query = mc_question return GradablePaperQAEnvironment( query=query, docs=self._base_docs.model_copy(), - evaluation_from_answer=evaluation_from_answer, sources=sources, rewards=self._rewards, **self._env_kwargs, @@ -338,7 +448,7 @@ def get_new_env_by_idx(self, idx: int) -> GradablePaperQAEnvironment: ) from exc sources.append(doi) return self._make_gradable_environment( - ideal=self.data.iloc[idx].ideal, + ideal_answer=self.data.iloc[idx].ideal, distractors=self.data.iloc[idx].distractors, question=self.data.iloc[idx].question, sources=sources, diff --git a/paperqa/litqa.py b/paperqa/litqa.py index 2f5607af..878fe971 100644 --- a/paperqa/litqa.py +++ b/paperqa/litqa.py @@ -2,284 +2,38 @@ from __future__ import annotations -import random -import re -import string -from ast import literal_eval -from collections.abc import Awaitable, Callable, Mapping, Sequence -from enum import StrEnum -from typing import TYPE_CHECKING, Literal, Self +from collections.abc import Mapping +from typing import TYPE_CHECKING -from aviary.core import Message -from llmclient import LiteLLMModel, LLMModel +from aviary.utils import MultipleChoiceEvaluation from paperqa._ldp_shims import discounted_returns -from paperqa.prompts import EVAL_PROMPT_TEMPLATE, QA_PROMPT_TEMPLATE -from paperqa.settings import make_default_litellm_model_list_settings -from paperqa.types import PQASession if TYPE_CHECKING: import pandas as pd -# Special case for LitQA, when ideal == "null" -UNSURE_OPTION = "Insufficient information to answer this question" -_CAPITAL_A_INDEX = ord("A") - -def make_mc_options( - ideal: str, - distractors: str | Sequence[str], - unsure_option: str | None = UNSURE_OPTION, - seed: int | None = None, -) -> tuple[str, str, str | None, list[str]]: - r""" - Return string of options (as letters) and correct answer. - - Examples: - >>> text, ideal_answer, unsure_answer, distractor_answers = make_mc_options( - ... ideal="1", distractors=["0", "2", "Dog"], seed=0 - ... 
) - >>> text - 'A) Dog\nB) 2\nC) 0\nD) Insufficient information to answer this question\nE) 1' - >>> ideal_answer - 'E' - >>> unsure_answer - 'D' - >>> distractor_answers - ['C', 'B', 'A'] - """ - if isinstance(distractors, str): - try: - split_distractors = literal_eval(distractors) - if not isinstance(split_distractors, list): - raise TypeError("Need split_distractors to be a list.") # noqa: TRY301 - except (ValueError, SyntaxError, TypeError): - split_distractors = [d.strip("'[ ]\"") for d in distractors.split(",")] - distractors = split_distractors - # We are going to modify options in-place, so copy the distractors - options = [*distractors] - - if ideal == "null": - if not unsure_option: - raise ValueError( - 'Dataset configured for "unsure" options via ' - 'ideal="null", please specify "unsure_option".' - ) - correct_answer = unsure_option - else: - # add the answer to the options, only if not null - options.append(ideal) - correct_answer = ideal - - if unsure_option: - options.append(unsure_option) - - if len(options) > len(string.ascii_lowercase): - raise NotImplementedError( - "Didn't handle more multiple choice options than letters, options were" - f" {options}." - ) - random.Random(seed).shuffle(options) - return ( - "\n".join([f"{_CAPITAL_A_INDEX + i:c}) {o}" for i, o in enumerate(options)]), - chr(_CAPITAL_A_INDEX + options.index(correct_answer)), - chr(_CAPITAL_A_INDEX + options.index(unsure_option)) if unsure_option else None, - [chr(_CAPITAL_A_INDEX + options.index(dstr)) for dstr in distractors], - ) - - -DEFAULT_EVAL_MODEL_NAME = "gpt-4-turbo-2024-04-09" DEFAULT_REWARD_MAPPING = {"correct": 1.0, "unsure": 0.1, "incorrect": -1.0} -SEED_USING_QUESTION: Literal["SEED_USING_QUESTION"] = "SEED_USING_QUESTION" # Sentinel - - -class LitQAEvaluation(StrEnum): - """Possible evaluation results for a LitQA question and methods for working with answers.""" - - CORRECT = "correct" - INCORRECT = "incorrect" - UNSURE = "unsure" - - @property - def answer(self) -> str | None: - return getattr(self, "_answer", None) - - @answer.setter - def answer(self, value: str | None) -> None: - self._answer = value - - @property - def ideal(self) -> str | None: - return getattr(self, "_ideal", None) - @ideal.setter - def ideal(self, value: str) -> None: - self._ideal = value - def make_discounted_returns( - self, - num_steps: int, - rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING, - discount: float = 1.0, - ) -> list[float]: - try: - return discounted_returns( - # paper-qa has no intermediary rewards - [0] * (num_steps - 1) + [rewards[self.value]], - terminated=[False] * (num_steps - 1) + [True], - discount=discount, - ) - except TypeError as exc: - raise ImportError( - "Making discounted returns requires the 'ldp' extra for 'ldp'. Please:" - " `pip install paper-qa[ldp]`." 
- ) from exc - - @classmethod - def from_answer( - cls, - text: str, - ideal_mc_answer: str, - unsure_mc_answer: str | None = None, - total_options: int | None = None, - ) -> LitQAEvaluation: - """Compare text with a multiple choice answer or optionally an unsure answer.""" - - def extract_answer(answer: str) -> str: - # first capital letter, like A or A) - s = re.search(r"([A-Z])\)?", answer, re.DOTALL) - if s is not None: - return s.group(1) - return answer.split()[0][0].upper() - - result = extract_answer(text) - if ( - total_options is not None - and ord(result[0]) - _CAPITAL_A_INDEX + 1 > total_options - ): - # The result extracted was not in the options - evaluation = cls.INCORRECT - evaluation.answer = result - # From here, if we don't match either the ideal or the unsure multiple choice - # options then we declare the answer as incorrect. - elif unsure_mc_answer and result[0].lower() == unsure_mc_answer[0].lower(): - evaluation = cls.UNSURE - evaluation.answer = unsure_mc_answer - elif result[0].lower() == ideal_mc_answer[0].lower(): - evaluation = cls.CORRECT - evaluation.answer = ideal_mc_answer - else: - evaluation = cls.INCORRECT - evaluation.answer = result - evaluation.ideal = ideal_mc_answer - return evaluation - - @classmethod - def from_question( - cls, - ideal: str, - distractors: str | list[str], - question: str, - use_unsure: bool = True, - eval_model: LLMModel | str = DEFAULT_EVAL_MODEL_NAME, - seed: int | Literal["SEED_USING_QUESTION"] | None = None, - ) -> tuple[str, Callable[[PQASession | str], Awaitable[LitQAEvaluation]]]: - """ - Create a LitQA question and an answer-to-evaluation function. - - Args: - ideal: Ideal answer term's text (not a multiple choice letter). - distractors: Distractor terms' text (not multiple choice letters). - question: Question text. - use_unsure: Flag (default is enabled) to add an 'insufficient answer' term. - eval_model: Evaluation model to use for multiple choice letter extraction - from a text answer. - seed: Optional seed to use in randomization of multiple choice letters. - Optionally pass in the string literal "SEED_USING_QUESTION" to hash the - input question for the seed. - - Returns: - Two-tuple of created LitQA question, function (that can be thought of as - stateless) to use to extract an evaluation result from an answer. - """ - if seed == SEED_USING_QUESTION: - seed = hash(question) - text, ideal_answer, unsure_answer, distractor_answers = make_mc_options( - ideal=ideal, - distractors=distractors, - seed=seed, - **({} if use_unsure else {"unsure_option": None}), - ) - qa_prompt = QA_PROMPT_TEMPLATE.format(question=question, options=text) - - if isinstance(eval_model, str): - eval_model = LiteLLMModel( - name=eval_model, - config=make_default_litellm_model_list_settings(eval_model), - ) - - async def llm_from_answer(answer: PQASession | str) -> LitQAEvaluation: - if isinstance(answer, PQASession): - answer = answer.answer - eval_chunk = await eval_model.achat( - messages=[ - Message( - role="user", - content=EVAL_PROMPT_TEMPLATE.format( - qa_prompt=qa_prompt, qa_answer=answer - ), - ), - ] - ) - if not isinstance(eval_chunk.text, str): - raise NotImplementedError( - f"Expected evaluation chunk to be a string, not {eval_chunk.text}." 
- ) - evaluation = cls.from_answer( - text=eval_chunk.text, - ideal_mc_answer=ideal_answer, - unsure_mc_answer=unsure_answer, - total_options=len(distractor_answers) + (2 if use_unsure else 1), - ) - # convert MC answers back to full text option so that it - # is meaningful - evaluation.ideal = ideal - if evaluation == cls.CORRECT: - evaluation.answer = ideal - elif evaluation == cls.UNSURE: - evaluation.answer = UNSURE_OPTION - else: - try: - evaluation.answer = distractors[ - distractor_answers.index(evaluation.answer or "") - ] - except ValueError: - evaluation.answer = None - return evaluation - - return qa_prompt, llm_from_answer - - @classmethod - def calculate_accuracy_precision( - cls, evaluations: Sequence[Self | str] - ) -> tuple[float, float]: - """ - Calculate LitQA-specific accuracy and precision metrics upon evaluations. - - Raises: - ZeroDivisionError: if an empty input. - - Returns: - Two-tuple of accuracy = (num correct) / (num questions) and - precision = (num correct) / ((num questions) - (num unsure)). - """ - evaluations = [e if isinstance(e, cls) else cls(e) for e in evaluations] - num_correct = sum(e == cls.CORRECT for e in evaluations) - accuracy = num_correct / len(evaluations) - precision = num_correct / sum( - e in {cls.CORRECT, cls.INCORRECT} for e in evaluations +def make_discounted_returns( + evaluation: MultipleChoiceEvaluation, + num_steps: int, + rewards: Mapping[str, float] = DEFAULT_REWARD_MAPPING, + discount: float = 1.0, +) -> list[float]: + try: + return discounted_returns( + # paper-qa has no intermediary rewards + [0] * (num_steps - 1) + [rewards[evaluation.value]], + terminated=[False] * (num_steps - 1) + [True], + discount=discount, ) - return accuracy, precision + except TypeError as exc: + raise ImportError( + "Making discounted returns requires the 'ldp' extra for 'ldp'. Please:" + " `pip install paper-qa[ldp]`." + ) from exc DEFAULT_LABBENCH_HF_HUB_NAME = "futurehouse/lab-bench" diff --git a/paperqa/types.py b/paperqa/types.py index 007caf88..db932b08 100644 --- a/paperqa/types.py +++ b/paperqa/types.py @@ -132,7 +132,19 @@ class PQASession(BaseModel): context: str = "" contexts: list[Context] = Field(default_factory=list) references: str = "" - formatted_answer: str = "" + formatted_answer: str = Field( + default="", + description=( + "Optional prettified answer that includes information like question and" + " citations." + ), + ) + graded_answer: str | None = Field( + default=None, + description=( + "Optional graded answer, used for things like multiple choice questions." 
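A worked example (not part of this patch) for the new module-level `make_discounted_returns` above, mirroring the parametrization of the rewritten `test_make_discounted_returns` later in this diff. It needs the `ldp` extra installed, since the helper delegates to `ldp.utils.discounted_returns`.

```python
# Hedged sketch: expected values match the new test parametrization in tests/test_litqa.py.
from aviary.utils import MultipleChoiceEvaluation

from paperqa.litqa import make_discounted_returns

# A 3-step trajectory graded "correct" gets rewards [0, 0, 1.0] (terminal step only),
# so with discount 0.5 the returns are [1.0 * 0.5**2, 1.0 * 0.5, 1.0].
assert make_discounted_returns(
    MultipleChoiceEvaluation.CORRECT, num_steps=3, discount=0.5
) == [0.25, 0.5, 1.0]

# An "unsure" grade maps to reward 0.1 via DEFAULT_REWARD_MAPPING.
assert make_discounted_returns(
    MultipleChoiceEvaluation.UNSURE, num_steps=3, discount=0.5
) == [0.025, 0.05, 0.1]
```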
+ ), + ) cost: float = 0.0 # Map model name to a two-item list of LLM prompt token counts # and LLM completion token counts diff --git a/pyproject.toml b/pyproject.toml index 4d1fb2de..e900cb06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "aiohttp>=3.10.6", # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError "anyio", "fh-llm-client", - "fhaviary[llm]>=0.10.2", # For tool execution concurrency + "fhaviary[llm]>=0.14", # For MultipleChoiceQuestion "html2text", # TODO: evaluate moving to an opt-in dependency "httpx", "numpy", @@ -77,7 +77,7 @@ dev = [ "vcrpy>=6", # Pin for https://github.com/kevin1024/vcrpy/issues/884 ] ldp = [ - "ldp>=0.16.0", # For Callback.before_rollout + "ldp>=0.17", # For evaluate_consensus ] local = [ "sentence-transformers", diff --git a/tests/test_litqa.py b/tests/test_litqa.py index 18e03f48..0a36cefc 100644 --- a/tests/test_litqa.py +++ b/tests/test_litqa.py @@ -1,249 +1,56 @@ -from collections.abc import Sequence from typing import cast import pytest +from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion -from paperqa.litqa import ( - SEED_USING_QUESTION, - UNSURE_OPTION, - LitQAEvaluation, - read_litqa_v2_from_hub, -) -from tests.conftest import VCR_DEFAULT_MATCH_ON - +from paperqa.litqa import make_discounted_returns, read_litqa_v2_from_hub -class TestLitQAEvaluation: - @staticmethod - def _assert_prompt_is_valid( - qa_prompt: str, question: str, ideal: str, distractors: Sequence[str] - ) -> None: - for substr in (question, "Insufficient information", ideal, *distractors): - assert qa_prompt.count(substr) == 1 - # Use for general purpose testing - ZIP_CODE_QUESTION_IDEAL_DISTRACTORS = ( - "What is my office's zip code?", - "94107", - ["-8", "94106", "cheesecake"], - ) - # The following two are used to check we don't leak on the LLM's innate knowledge - MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS = ( - "What is the meaning of life?", - "42", - ["-84", "11", "cheesecake"], +@pytest.mark.parametrize( + ("evaluation", "expected_dreturns"), + [ + (MultipleChoiceEvaluation.CORRECT, [0.25, 0.5, 1.0]), + (MultipleChoiceEvaluation.INCORRECT, [-0.25, -0.5, -1.0]), + (MultipleChoiceEvaluation.UNSURE, [0.025, 0.05, 0.1]), + ], +) +def test_make_discounted_returns( + evaluation: MultipleChoiceEvaluation, expected_dreturns: list[float] +) -> None: + assert ( + make_discounted_returns(evaluation, num_steps=3, discount=0.5) + == expected_dreturns ) - # Source: https://github.com/Future-House/LAB-Bench/blob/43b2045c67a2da12c233689cf538f1ed5c42f590/LitQA2/litqa-v2-public.jsonl#L130 - LITQA2_QUESTION_IDEAL_DISTRACTORS = ( + + +def test_creating_litqa_questions() -> None: + """Test making LitQA eval questions after downloading from Hugging Face Hub.""" + _, eval_split = read_litqa_v2_from_hub(seed=42) + assert len(eval_split) > 3 + assert [ + MultipleChoiceQuestion( + question=cast(str, row.question), + options=cast(list[str], row.distractors), + ideal_answer=cast(str, row.ideal), + shuffle_seed=42, + ).question_prompt + for row in eval_split[:3].itertuples() + ] == [ ( - "What method was used to demonstrate that the enzyme PafA is stable after" - " incubation with 4M urea for 14 days?" 
+ "Q: Which of the following mutations in yeast Pbs2 increases its" + " interaction with SH3?\n\nOptions:\nA) P97A\nB) N92S\nC) Insufficient" + " information to answer this question\nD) K85W\nE) N92H\nF) I87W\nG) S83F" ), - "circular dichroism", - ["cryo EM", "x-ray crystallography", "NMR"], - ) - - @pytest.mark.asyncio - @pytest.mark.vcr(match_on=[*VCR_DEFAULT_MATCH_ON, "body"]) - @pytest.mark.parametrize( ( - "question", - "ideal", - "distractors", - "answer", - "expected_eval", - "expected_dreturns", - "extracted_answer", + "Q: What percentage of colorectal cancer-associated fibroblasts typically" + " survive at 2 weeks if cultured with the platinum-based chemotherapy" + " oxaliplatin?\n\nOptions:\nA) Insufficient information to answer this" + " question\nB) 0%\nC) 50-80%\nD) 20-50%\nE) 1-20%\nF) 80-99%" ), - [ - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 94107", - LitQAEvaluation.CORRECT, - [0.25, 0.5, 1.0], - "94107", - id="matched-correct-option", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 14004", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="didnt-match-and-no-llm-innate-knowledge", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 94106", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - "94106", - id="matched-incorrect-option", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "Insufficient information", - LitQAEvaluation.UNSURE, - [0.025, 0.05, 0.1], - UNSURE_OPTION, - id="matched-unsure-option", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "the answer is 94106 or 94107", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="matched-several-options", - ), - pytest.param( - *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, - "", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="empty-answer1", - ), - pytest.param( - *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, - "14", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="didnt-match-and-llm-has-innate-knowledge", - ), - pytest.param( - *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, - "", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="empty-answer2", - ), - pytest.param( - *LITQA2_QUESTION_IDEAL_DISTRACTORS, - "", - LitQAEvaluation.INCORRECT, - [-0.25, -0.5, -1.0], - None, - id="empty-answer3", - ), - ], - ) - async def test_from_question( - self, - question: str, - ideal: str, - distractors: str | list[str], - answer: str, - expected_eval: LitQAEvaluation, - expected_dreturns: list[float], - extracted_answer: str, - ) -> None: - """Tests that we can create a LitQA question and evaluate answers.""" - qa_prompt, eval_fn = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, - question=question, - seed=42, # Seed for VCR cassette - ) - self._assert_prompt_is_valid(qa_prompt, question, ideal, distractors) - - evaluation = await eval_fn(answer) - assert evaluation == expected_eval - if evaluation == LitQAEvaluation.CORRECT: - assert evaluation.answer == ideal - assert evaluation.answer == extracted_answer - assert evaluation.ideal == ideal - assert evaluation.make_discounted_returns(3, discount=0.5) == expected_dreturns - - def test_consistent_mc_options(self) -> None: - """Tests that creating multiple evaluations with the same seed results in the same prompt.""" - question, ideal, distractors = self.MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS - - qa_prompt_1a, _ = LitQAEvaluation.from_question( - ideal=ideal, 
distractors=distractors, question=question, seed=0 - ) - self._assert_prompt_is_valid(qa_prompt_1a, question, ideal, distractors) - - qa_prompt_1b, _ = LitQAEvaluation.from_question( - ideal=ideal, distractors=distractors, question=question, seed=0 - ) - self._assert_prompt_is_valid(qa_prompt_1b, question, ideal, distractors) - assert qa_prompt_1a == qa_prompt_1b, "Same seeding should lead to same prompts" - - qa_prompt_2a, _ = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, - question=question, - seed=SEED_USING_QUESTION, - ) - self._assert_prompt_is_valid(qa_prompt_2a, question, ideal, distractors) - - qa_prompt_2b, _ = LitQAEvaluation.from_question( - ideal=ideal, - distractors=distractors, - question=question, - seed=SEED_USING_QUESTION, - ) - self._assert_prompt_is_valid(qa_prompt_2b, question, ideal, distractors) - assert ( - qa_prompt_2a == qa_prompt_2b - ), "Same seeding strategy should lead to same prompts" - assert ( - qa_prompt_2a != qa_prompt_1a - ), "Different seeding strategies should lead to different prompts" - - def test_creating_litqa_questions(self) -> None: - """Test making LitQA eval questions after downloading from Hugging Face Hub.""" - _, eval_split = read_litqa_v2_from_hub(seed=42) - assert len(eval_split) > 3 - assert [ - LitQAEvaluation.from_question( - ideal=cast(str, row.ideal), - distractors=cast(list[str], row.distractors), - question=cast(str, row.question), - seed=42, - )[0] - for row in eval_split[:3].itertuples() - ] == [ - ( - "Q: Which of the following mutations in yeast Pbs2 increases its" - " interaction with SH3?\n\nOptions:\nA) S83F\nB) I87W\nC) N92H\nD) K85W\nE)" - " Insufficient information to answer this question\nF) N92S\nG) P97A" - ), - ( - "Q: What percentage of colorectal cancer-associated fibroblasts typically" - " survive at 2 weeks if cultured with the platinum-based chemotherapy" - " oxaliplatin?\n\nOptions:\nA) 80-99%\nB) 1-20%\nC) 20-50%\nD) 50-80%\nE)" - " 0%\nF) Insufficient information to answer this question" - ), - ( - "Q: Which of the following genes shows the greatest difference in gene" - " expression between homologous cell types in mouse and human" - " brain?\n\nOptions:\nA) Htr3a\nB) Htr5a\nC) Htr6\nD) Insufficient" - " information to answer this question\nE) Htr1d" - ), - ] - - @pytest.mark.parametrize( - ("evals", "accuracy_precision"), - [ - ( - [ - LitQAEvaluation.CORRECT, - LitQAEvaluation.CORRECT, - LitQAEvaluation.CORRECT, - ], - (1, 1), - ), - (["correct", "correct", "unsure"], (2 / 3, 1)), - ( - [LitQAEvaluation.CORRECT, LitQAEvaluation.UNSURE, "incorrect"], - (1 / 3, 1 / 2), - ), - ], - ) - def test_calculate_accuracy_precision( - self, evals: Sequence[LitQAEvaluation], accuracy_precision: tuple[float, float] - ) -> None: - assert LitQAEvaluation.calculate_accuracy_precision(evals) == accuracy_precision + ( + "Q: Which of the following genes shows the greatest difference in gene" + " expression between homologous cell types in mouse and human" + " brain?\n\nOptions:\nA) Htr1d\nB) Insufficient information to answer this" + " question\nC) Htr6\nD) Htr5a\nE) Htr3a" + ), + ] diff --git a/tests/test_task.py b/tests/test_task.py index e1f14d83..dfe7a452 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -5,6 +5,7 @@ import pytest from aviary.env import TASK_DATASET_REGISTRY, TaskConfig, TaskDataset +from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion from ldp.agent import SimpleAgent from ldp.alg.callbacks import Callback, MeanMetricsCallback, 
StoreTrajectoriesCallback from ldp.alg.runners import Evaluator, EvaluatorConfig @@ -20,7 +21,7 @@ LitQAv2TaskSplit, ) from paperqa.agents.tools import GenerateAnswer -from paperqa.litqa import DEFAULT_REWARD_MAPPING, SEED_USING_QUESTION, LitQAEvaluation +from paperqa.litqa import DEFAULT_REWARD_MAPPING @pytest.fixture(name="base_query_request") @@ -60,7 +61,7 @@ def __init__(self, *args, **kwargs): def get_new_env_by_idx(self, idx: int) -> GradablePaperQAEnvironment: return self._make_gradable_environment( - ideal=self.data[idx][0], + ideal_answer=self.data[idx][0], distractors=self.data[idx][1], question=self.data[idx][2], sources=self.data[idx][3], @@ -87,7 +88,10 @@ def __init__(self): self.query_to_envs: dict[str, PaperQAEnvironment] = {} async def before_rollout(self, traj_id: str, env) -> None: # noqa: ARG002 - self.query_to_envs[env._query.query] = env + query: str | MultipleChoiceQuestion = env._query.query + self.query_to_envs[ + query if isinstance(query, str) else query.question_prompt + ] = env class TestTaskDataset: @@ -120,9 +124,9 @@ async def test___len__( obs, _ = await env.reset() assert ( "Q: SLC14A1 been identified as a specific marker for endothelial" - " cells in which organ?\n\nOptions:\nA) heart\nB) eye\nC)" - " prostate\nD) Insufficient information to answer this question\nE)" - " liver" in (obs[0].content or "") + " cells in which organ?\n\nOptions:\nA) liver\nB) eye\nC)" + " prostate\nD) heart\nE) Insufficient information to answer this" + " question" in (obs[0].content or "") ) assert env.sources, "Sources need to be accessible" assert isinstance( @@ -159,7 +163,7 @@ async def test_evaluation( "deleted_dockeys", } ), - "question_kwargs": {"seed": SEED_USING_QUESTION}, + "question_kwargs": {"seed": MultipleChoiceQuestion.SEED_USING_QUESTION}, }, ) # NOTE: set base_query after construction of the TaskConfig. because in @@ -193,7 +197,10 @@ async def test_evaluation( assert metrics_callback.eval_means["reward"] > 0, "Expected some wins" correct_reward, incorrect_reward = ( DEFAULT_REWARD_MAPPING[evaluation.value] - for evaluation in (LitQAEvaluation.CORRECT, LitQAEvaluation.INCORRECT) + for evaluation in ( + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.INCORRECT, + ) ) worst_case_reward_given_correct = ( correct_reward * correct_percentage diff --git a/uv.lock b/uv.lock index 65519dd6..150522e2 100644 --- a/uv.lock +++ b/uv.lock @@ -240,7 +240,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -1643,7 +1643,7 @@ wheels = [ [[package]] name = "paper-qa" -version = "5.8.1.dev14+g919bf0c" +version = "5.8.1.dev15+gf68b03e.d20250102" source = { editable = "." 
} dependencies = [ { name = "aiohttp" }, @@ -1715,7 +1715,31 @@ zotero = [ [package.dev-dependencies] dev = [ - { name = "paper-qa", extra = ["dev"] }, + { name = "datasets" }, + { name = "ipython" }, + { name = "ldp" }, + { name = "mypy" }, + { name = "pandas-stubs" }, + { name = "pre-commit" }, + { name = "pydantic" }, + { name = "pylint-pydantic" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "pytest-recording" }, + { name = "pytest-rerunfailures" }, + { name = "pytest-subtests" }, + { name = "pytest-sugar" }, + { name = "pytest-timer", extra = ["colorama"] }, + { name = "pytest-xdist" }, + { name = "python-dotenv" }, + { name = "pyzotero" }, + { name = "qdrant-client" }, + { name = "refurb" }, + { name = "sentence-transformers" }, + { name = "typeguard" }, + { name = "types-pyyaml" }, + { name = "types-setuptools" }, + { name = "vcrpy" }, ] [package.metadata] @@ -1724,11 +1748,11 @@ requires-dist = [ { name = "anyio" }, { name = "datasets", marker = "extra == 'datasets'" }, { name = "fh-llm-client" }, - { name = "fhaviary", extras = ["llm"], specifier = ">=0.10.2" }, + { name = "fhaviary", extras = ["llm"], specifier = ">=0.14" }, { name = "html2text" }, { name = "httpx" }, { name = "ipython", marker = "extra == 'dev'", specifier = ">=8" }, - { name = "ldp", marker = "extra == 'ldp'", specifier = ">=0.16.0" }, + { name = "ldp", marker = "extra == 'ldp'", specifier = ">=0.17" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.8" }, { name = "numpy" }, { name = "pandas-stubs", marker = "extra == 'typing'" }, @@ -1860,7 +1884,7 @@ name = "portalocker" version = "2.10.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891 } wheels = [ @@ -2170,7 +2194,7 @@ name = "pympler" version = "1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "pywin32", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dd/37/c384631908029676d8e7213dd956bb686af303a80db7afbc9be36bc49495/pympler-1.1.tar.gz", hash = "sha256:1eaa867cb8992c218430f1708fdaccda53df064144d1c5656b1e6f1ee6000424", size = 179954 } wheels = [ @@ -2935,21 +2959,21 @@ dependencies = [ { name = "fsspec" }, { name = "jinja2" }, { name = "networkx" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = 
"nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" }, { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, - { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux'" }, { name = "typing-extensions" }, ] wheels = [ @@ -2969,7 +2993,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [