diff --git a/src/aviary/utils.py b/src/aviary/utils.py index 7fdacd81..35687bd2 100644 --- a/src/aviary/utils.py +++ b/src/aviary/utils.py @@ -2,14 +2,27 @@ import contextlib import inspect import io +import random +import re +import string +from ast import literal_eval +from collections.abc import Awaitable, Callable, Sequence from enum import StrEnum -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast + +from pydantic import BaseModel, Field, model_validator + +try: + from litellm import acompletion +except ImportError: + acompletion = None if TYPE_CHECKING: import numpy as np -LLM_EVAL_CONFIG = { +DEFAULT_EVAL_MODEL_NAME = "gpt-4o" +LLM_BOOL_EVAL_CONFIG = { "prompt": ( "Here is a question, the correct answer to the question, and a proposed answer" " to the question. Please tell me if the proposed answer is correct, given the" @@ -18,11 +31,11 @@ "\n\nCorrect answer: {correct_answer}" "\n\nProposed answer: {proposed_answer}" ), - "model": "gpt-4o-mini", + "model": DEFAULT_EVAL_MODEL_NAME, "temperature": 0, } -LLM_SCORE_EVAL_CONFIG = LLM_EVAL_CONFIG | { +LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | { "prompt": ( "Here is a question, the correct answer to the question, and a rubric for" " evaluating the question. Judge the proposed answer based on the given rubric." 
@@ -41,6 +54,13 @@ class EvalAnswerMode(StrEnum): LLM = "llm" # Ask an LLM to evaluate LLM_SCORE = "llm-score" # Ask an LLM to evaluate and return the score (normalized) + def get_default_config(self) -> dict[str, Any]: + if self == EvalAnswerMode.LLM: + return LLM_BOOL_EVAL_CONFIG + if self == EvalAnswerMode.LLM_SCORE: + return LLM_SCORE_EVAL_CONFIG + return {} + def partial_format(value: str, **formats: dict[str, Any]) -> str: """Partially format a string given a variable amount of formats.""" @@ -88,6 +108,23 @@ def is_coroutine_callable(obj) -> bool: return False +async def run_prompt( + prompt: str, model: str = DEFAULT_EVAL_MODEL_NAME, temperature: float | None = None +) -> str: + try: + response = await acompletion( + model=model, + temperature=temperature, + messages=[{"content": prompt, "role": "user"}], + ) + except TypeError: + raise ImportError( + "Answer evaluation requires the 'llm' extra for 'litellm'. Please:" + " `pip install aviary[llm]`." + ) from None + return response.choices[0].message.content + + async def eval_answer( proposed: str, correct: str, @@ -101,40 +138,27 @@ async def eval_answer( """ eval_mode = EvalAnswerMode(eval_mode) if eval_mode in {EvalAnswerMode.LLM, EvalAnswerMode.LLM_SCORE}: - try: - from litellm import acompletion - except ImportError as e: - raise ImportError( - "eval_answer requires the 'llm' extra for 'litellm'. Please:" - " `pip install aviary[llm]`." 
- ) from e if question is None: raise ValueError("Question must be provided for LLM evaluation mode.") - default_config = ( - LLM_EVAL_CONFIG - if eval_mode == EvalAnswerMode.LLM - else LLM_SCORE_EVAL_CONFIG - ) + default_config = eval_mode.get_default_config() config = llm_eval_config or default_config prompt = cast(str, config.get("prompt", default_config["prompt"])).format( question=question, correct_answer=correct, proposed_answer=proposed, ) - response = await acompletion( + response_msg = await run_prompt( + prompt, model=config.get("model", default_config["model"]), temperature=config.get("temperature", default_config["temperature"]), - messages=[{"content": prompt, "role": "user"}], ) if eval_mode == EvalAnswerMode.LLM: return await eval_answer( - response.choices[0].message.content.strip().casefold(), - "yes", - eval_mode=EvalAnswerMode.EXACT, + response_msg.strip().casefold(), "yes", eval_mode=EvalAnswerMode.EXACT ) try: - return float(response.choices[0].content.strip()) / float( - config.get("max_score", default_config["max_score"]) # type: ignore[arg-type] + return float(response_msg.strip()) / float( + config.get("max_score", default_config["max_score"]) ) except ValueError: return 0 @@ -149,3 +173,182 @@ async def eval_answer( return float(gt in pred) raise RuntimeError(f"Invalid evaluation mode: {eval_mode}") + + +_CAPITAL_A_INDEX = ord("A") + + +class MultipleChoiceQuestion(BaseModel): + QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}" + # TODO: combine with above eval_answer and its prompts + EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = ( + "Given the following question and a proposed answer to the question, return the" + " single-letter choice in the question that matches the proposed answer." + " If the proposed answer is blank or an empty string," + " or multiple options are matched, respond with '0'." 
+ "\n\nQuestion: {qa_prompt}" + "\n\nProposed Answer: {qa_answer}" + "\n\nSingle Letter Answer:" + ) + DEFAULT_UNSURE_OPTION: ClassVar[str] = ( + "Insufficient information to answer this question" + ) + SEED_USING_QUESTION: ClassVar[Literal["SEED_USING_QUESTION"]] = ( + "SEED_USING_QUESTION" + ) + + question: str = Field( + description="Question to answer (without multiple choice options)." + ) + options: Sequence[str] = Field(description="All multiple choice options.") + ideal_answer: str = Field( + description=( + "Desired ideal answer. If not one of the provided options, it will be" + " automatically added." + ) + ) + unsure_answer: str | None = Field( + default=DEFAULT_UNSURE_OPTION, + description=( + "Unsure answer text. If not one of the provided options, it will be" + " automatically added." + ), + ) + shuffle_seed: int | Literal["SEED_USING_QUESTION"] | None = Field( + default=None, + description=( + "Optional seed to use in randomization of options, where seeding is not" + " global (e.g. no `random.seed`). Optionally pass in the string literal" + " 'SEED_USING_QUESTION' to hash the question for the seed" + ), + ) + + @model_validator(mode="after") + def add_answers_and_shuffle(self) -> Self: + if self.ideal_answer not in self.options: + self.options = [*self.options, self.ideal_answer] + if self.unsure_answer and self.unsure_answer not in self.options: + self.options = [*self.options, self.unsure_answer] + if len(self.options) > len(string.ascii_lowercase): + raise NotImplementedError( + "Didn't handle more multiple choice options than letters, options were" + f" {self.options}." 
+ ) + if self.shuffle_seed == self.SEED_USING_QUESTION: + self.shuffle_seed = hash(self.question) + if self.shuffle_seed is not None: + self.options = random.Random(self.shuffle_seed).sample( + self.options, k=len(self.options) + ) + # Ensure deserialization doesn't re-shuffle + self.shuffle_seed = None + return self + + @property + def ideal_answer_index(self) -> int: + return self.options.index(self.ideal_answer) + + @property + def unsure_answer_index(self) -> int | None: + if self.unsure_answer is None: + return None + return self.options.index(self.unsure_answer) + + @property + def question_prompt(self) -> str: + return self.QUESTION_PROMPT_TEMPLATE.format( + question=self.question, + options="\n".join([ + f"{_CAPITAL_A_INDEX + i:c}) {o}" for i, o in enumerate(self.options) + ]), + ) + + @staticmethod + def split_options(options: str) -> list[str]: + """Split options string into a list of options. + + Examples: + >>> MultipleChoiceQuestion.split_options("apples, mangos") + ['apples', 'mangos'] + """ + try: + split_options = literal_eval(options) + if not isinstance(split_options, list): + raise TypeError("Need split_options to be a list.") # noqa: TRY301 + except (ValueError, SyntaxError, TypeError): + split_options = [d.strip("'[ ]\"") for d in options.split(",")] + return split_options + + async def grade( + self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None + ) -> "tuple[MultipleChoiceEvaluation, str, str]": + if prompt_runner is None: + prompt_runner = run_prompt + eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format( + qa_prompt=self.question_prompt, qa_answer=answer + ) + raw_evaluation = await prompt_runner(eval_prompt) + evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer( + raw_evaluation, self + ) + return evaluation, raw_evaluation, parsed_answer + + +class MultipleChoiceEvaluation(StrEnum): + CORRECT = "correct" + INCORRECT = "incorrect" + UNSURE = "unsure" # May be irrelevant if no unsure option provided 
+
+    @classmethod
+    def calculate_accuracy_precision(
+        cls, evaluations: Sequence[Self | str]
+    ) -> tuple[float, float]:
+        """
+        Calculate QA-specific accuracy and precision metrics upon evaluations.
+
+        Raises:
+            ZeroDivisionError: if an empty input, or if all evaluations are unsure.
+
+        Returns:
+            Two-tuple of accuracy = (num correct) / (num questions) and
+            precision = (num correct) / ((num questions) - (num unsure)).
+        """
+        evaluations = [e if isinstance(e, cls) else cls(e) for e in evaluations]
+        num_correct = sum(e == cls.CORRECT for e in evaluations)
+        accuracy = num_correct / len(evaluations)
+        precision = num_correct / sum(
+            e in {cls.CORRECT, cls.INCORRECT} for e in evaluations
+        )
+        return accuracy, precision
+
+    @classmethod
+    def from_answer(
+        cls, answer: str, question: MultipleChoiceQuestion
+    ) -> "tuple[MultipleChoiceEvaluation, str]":
+        """Make an evaluation from the input answer and multiple choice question.
+
+        Returns:
+            Two-tuple of answer enum and the raw answer extracted from the input answer.
+        """
+        # SEE: https://regex101.com/r/vcE9Hb/1
+        letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL)
+        # Get the letter answer, or fail over to the first non-whitespace char
+        answer_char = (
+            letter_search.group(1)
+            if letter_search is not None
+            else answer.split()[0][0].upper()
+        )
+        answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX
+        if answer_letter_index < 0 or answer_letter_index >= len(question.options):
+            # The result extracted was not in the options (e.g. '0')
+            return cls.INCORRECT, answer_char
+        # From here, if we don't match either the ideal or the unsure multiple choice
+        # options then we declare the answer as incorrect.
+ if ( + question.unsure_answer_index is not None + and answer_letter_index == question.unsure_answer_index + ): + return cls.UNSURE, cast(str, question.unsure_answer) + if answer_letter_index == question.ideal_answer_index: + return cls.CORRECT, question.ideal_answer + return cls.INCORRECT, question.options[answer_letter_index] diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml new file mode 100644 index 00000000..be2df091 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-llm-has-innate-knowledge].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle + Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "513" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + 
H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k + u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw + 47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK + RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie + H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu + tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6 + AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fde1cf88cf1b-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:29 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "363" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999874" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_aff8daa48aa43d3df077f97da6136e5a + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml new file mode 100644 index 00000000..38077163 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[didnt-match-and-no-llm-innate-knowledge].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the 
single-letter choice in the question that + matches the proposed answer. If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 14004\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "536" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJLT8MwEITv+RWWzw1KS2lCb3BCSDykcuAhFBl7kxocr2Vveajqf0dO + 2yQVIHHJYb6dyewm64QxrhWfMy6XgmTjTHpWPV6fFzf1Jcwebu4WF4tbunKFwvzj/n3GR9GBL68g + ae86ktg4A6TRbrH0IAhi6jg/nk7zLC9OWtCgAhNttaN0iukkm0zTrEizXa5copYQ+Jw9JYwxtm6f + saJV8MnnLBvtlQZCEDXweTfEGPdoosJFCDqQsMRHPZRoCWzbOhvqHqpVELGWXRmz0zfdiwzWzuNL + 2PFOr7TVYVl6EAFtDA2Ejrd0kzD23C60OujIncfGUUn4BjYGjscn2zzen3BAd4yQhBmaZqNf4koF + JLQJg4twKeQSVG/tzydWSuMAJIOlf5b5LXu7uLb1f+J7ICU4AlU6D0rLw4X7MQ/xB/trrDtyW5iH + r0DQlJW2NXjn9fYbV64U+akqlJDjiieb5BsAAP//AwBRMcSQ7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdc63fbf9e53-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:25 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - 
X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "212" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999868" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_afd8c66d84f3b42a8cd2b8a6bf855054 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml new file mode 100644 index 00000000..057ef1d0 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: \n\nSingle + Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "517" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFLLTsMwELznKyyfG+S2gaa5FegFJKRKnEAocp1NaurYlr3hoar/jpyG + JBUgcfFhZmc8O/YhIoTKgmaEih1HUVsVr8qnh/XbzfvdI7LN7WzF9vf8Mik31xtYN3QSFGb7CgK/ + VRfC1FYBSqNPtHDAEYLrdDFPkgVbpGlL1KYAFWSVxTgx8YzNkpilMbvqhDsjBXiakeeIEEIO7Rki + 6gI+aEbY5BupwXteAc36IUKoMyoglHsvPXKNdDKQwmgE3aZmY9xB2XgeYulGqQ4/9hcpU1lntr7j + e7yUWvpd7oB7o4OpR2Npyx4jQl7ahZqzjNQ6U1vM0exBB8MpW5786FDhiO04NMjVCJ52LZzb5QUg + l8qPGqGCix0Ug3SojzeFNCMiGi39M8xv3qfFpa7+Yz8QQoBFKHLroJDifOFhzEH4YH+N9SW3gan/ + 9Ah1XkpdgbNOnt64tPmy5Fu+LNk8pdEx+gIAAP//AwDTwVpp7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fddcea1d251d-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:28 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh 
+ openai-processing-ms: + - "174" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999872" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_4d40eb2c66dfd308a7b75c7cd80c405b + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml new file mode 100644 index 00000000..a0acce15 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: \n\nSingle Letter + Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "511" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJRT4MwFIXf+RVNn4eBicPx5oszWTJjfNBoDOnaC9SVtmlLMrPsv5sW + HCzOxBceznfP4dwLhwghzBkuEKYNcbTVIr6r3jar1eb5IXtKb7brPL9/3e/a5fxFrOtHPPMOtf0E + 6n5cV1S1WoDjSvaYGiAOfGqaX2dZnuTLJIBWMRDeVmsXZyqeJ/MsTm7jZDEYG8UpWFyg9wghhA7h + 6StKBntcoBATlBasJTXg4jSEEDZKeAUTa7l1RDo8GyFV0oEMrZOpbqDqLPG1ZCfEoB9PLxKq1kZt + 7cBPesUlt01pgFglfah1SuNAjxFCH2Gh7qwj1ka12pVO7UD6wDRZ9Hl4POGEDswpR8TUlM8uxJUM + HOHCTi6CKaENsNE6no90jKsJiCZL/y5zKbtfnMv6P/EjoBS0A1ZqA4zT84XHMQP+B/tr7HTkUBjb + L+ugLSsuazDa8P4bV7ok+ZLdMkLTCkfH6BsAAP//AwBwbnWk7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fde81f05ceb1-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:30 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + 
openai-processing-ms: + - "332" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999875" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_817ca7ae018d7baa48236c7ad4f4f151 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml new file mode 100644 index 00000000..d70cc972 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer3].yaml @@ -0,0 +1,105 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What method + was used to demonstrate that the enzyme PafA is stable after incubation with + 4M urea for 14 days?\n\nOptions:\nA) cryo EM\nB) Insufficient information to + answer this question\nC) NMR\nD) x-ray crystallography\nE) circular dichroism\n\nProposed + Answer: \n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "624" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJLNboMwEITvPIXlM1SEkJJyS9VIlfpz6SmpKuSYhTg1tmVvpEZR3r0y + IUDVVOqFw3w7w+zCMSCEipLmhPItQ94YGS2q9eujmL9NXtaJWKW7wxM+r+TD8rC4ny5p6B16swOO + F9cN142RgEKrM+YWGIJPnWTTNM3i7G7SgkaXIL2tNhilOkriJI3ieRTfdsatFhwczcl7QAghx/bp + K6oSvmhO4vCiNOAcq4Hm/RAh1GrpFcqcEw6ZQhoOkGuFoNrW8Vi3UO0d87XUXspOP/Uvkro2Vm9c + x3u9Ekq4bWGBOa18qENtaEtPASEf7UL7Hx2psboxWKD+BOUDJ9PknEeHE45ox1Ajk2PTNLwSV5SA + TEg3ugjljG+hHKzD+di+FHoEgtHSv8tcyz4vLlT9n/gBcA4GoSyMhVLwnwsPYxb8D/bXWH/ktjB1 + B4fQFJVQNVhjxfkbV6aosvkMNrMqzWhwCr4BAAD//wMANO06tewCAAA= + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdedda0ceb36-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:31 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + 
access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "259" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999845" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_223a9415a5a19029f86768ffbabf3d6f + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml new file mode 100644 index 00000000..7f73abaa --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-correct-option].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "536" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6ggoXlwyyGKcql6Sx+qkLEXcGtsyzZSoyj/vTKQ + QNRU6oXDfDvD7MIpQAhzhjOEaU0cbbSINuXb02Z73L287ndH1ias2C8k3a0OB1Y949A7VPEJ1F1c + D1Q1WoDjSvaYGiAOfGqynKfpMl6u0g40ioHwtkq7KFXRLJ6lUbyK4sVgrBWnYHGG3gOEEDp1T19R + MvjGGYrDi9KAtaQCnF2HEMJGCa9gYi23jkiHwxFSJR3IrvV2qhsoW0t8LdkKMejn64uEqrRRhR34 + VS+55LbODRCrpA+1Tmnc0XOA0Ee3UHvTEWujGu1yp75A+sAkeezz8HjCCR2YU46IqWkR3onLGTjC + hZ1cBFNCa2CjdTwfaRlXExBMlv5d5l52vziX1X/iR0ApaAcs1wYYp7cLj2MG/A/219j1yF1hbI/W + QZOXXFZgtOH9Ny51vi5JQdZlPF/h4Bz8AAAA//8DAKuPA4PsAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdc06b78cf26-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:24 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - 
future-house-xr4tdh + openai-processing-ms: + - "291" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999868" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_89e3d88c7f12861d7e774e452300b36d + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml new file mode 100644 index 00000000..45376f48 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-incorrect-option].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 94106\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "536" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4ySX2uDMBTF3/0UIc912M7WzreVjv1hDPo06BiSJlfNFpMsibBS+t1H1KplHezF + h/O753ju1UOAEOYMpwjTkjhaaRHe5tuX1erpfsv3z8s4ed1sHvL1191+UT9WDk+8Q+0+gLqT64qq + SgtwXMkWUwPEgU+dJtdxnETJct6ASjEQ3lZoF8YqnEWzOIyWYbTojKXiFCxO0VuAEEKH5ukrSgbf + OEXR5KRUYC0pAKf9EELYKOEVTKzl1hHZ1u0gVdKBbFqvx7qBvLbE15K1EJ1+7F8kVKGN2tmO93rO + JbdlZoBYJX2odUrjhh4DhN6bheqzjlgbVWmXOfUJ0gdOp/M2Dw8nHNGOOeWIGJsWkwtxGQNHuLCj + i2BKaAlssA7nIzXjagSC0dK/y1zKbhfnsvhP/AAoBe2AZdoA4/R84WHMgP/B/hrrj9wUxnZvHVRZ + zmUBRhvefuNcZyS5YUtG6DTHwTH4AQAA//8DAK9WW8vsAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdcb6d3b645e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:26 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - 
future-house-xr4tdh + openai-processing-ms: + - "282" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999868" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_0f4462cd5dd31fe3e1a9d6847e563042 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml new file mode 100644 index 00000000..abe7094e --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-several-options].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: the answer + is 94106 or 94107\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "545" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jFJBS8MwGL33V4ScV+m2sm69CaLTgyjIDhMpWfK1zZYmIUnFMfbfJW3X + dqjgJYf3vvfyvpecAoQwZzhFmJbE0UqL8DbfPt8V+4U8bO6PL0/rTUnWD8XrVr59PkZ44hVqtwfq + LqobqiotwHElW5oaIA686zSZx3ESJcukISrFQHhZoV0Yq3AWzeIwWobRohOWilOwOEXvAUIInZrT + R5QMvnCKoskFqcBaUgBO+yGEsFHCI5hYy60j0uHJQFIlHcgmdTTGDeS1JT6WrIXo8HN/kVCFNmpn + O77Hcy65LTMDxCrpTa1TGjfsOUDoo1movsqItVGVdplTB5DecDpdtX54qHDEdpxTjogRPOtauLbL + GDjChR01gimhJbBBOtRHasbViAhGS/8M85t3uziXxX/sB4JS0A5Ypg0wTq8XHsYM+A/211hfchMY + 26N1UGU5lwUYbXj7xrnOVjnZkVUezZc4OAffAAAA//8DAAUNxI3sAgAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdd6bc71fa2e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:27 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + 
openai-organization: + - future-house-xr4tdh + openai-processing-ms: + - "249" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999865" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_72f1a2af642dad2884e52d652e775182 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml new file mode 100644 index 00000000..607cd8a2 --- /dev/null +++ b/tests/cassettes/TestLitQAEvaluation.test_grade[matched-unsure-option].yaml @@ -0,0 +1,104 @@ +interactions: + - request: + body: + '{"messages": [{"content": "Given the following question and a proposed + answer to the question, return the single-letter choice in the question that + matches the proposed answer. 
If the proposed answer is blank or an empty string, + or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is + my office''s zip code?\n\nOptions:\nA) -8\nB) Insufficient information to answer + this question\nC) cheesecake\nD) 94106\nE) 94107\n\nProposed Answer: Insufficient + information\n\nSingle Letter Answer:", "role": "user"}], "model": "gpt-4o"}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - "541" + content-type: + - application/json + host: + - api.openai.com + user-agent: + - AsyncOpenAI/Python 1.57.4 + x-stainless-arch: + - arm64 + x-stainless-async: + - async:asyncio + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.57.4 + x-stainless-raw-response: + - "true" + x-stainless-retry-count: + - "1" + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.12.7 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAAwAAAP//jJJLb4MwEITv/ArLZ6gIoDy4pS+lPeRWVWpVIWMWcGNsyzZSqyj/vTIQ + IGoq9cJhvp1hduHoIYRZgVOEaU0sbRQPtuXb/u4pP7w0z69kt3+I6221q/fN/SOJNPadQ+afQO3Z + dUNlozhYJkWPqQZiwaUuVnGSrMLVetmBRhbAna1SNkhkEIVREoTrIFwOxloyCgan6N1DCKFj93QV + RQFfOEWhf1YaMIZUgNNxCCGsJXcKJsYwY4mw2J8glcKC6FrfznUNZWuIqyVazgf9NL6Iy0ppmZuB + j3rJBDN1poEYKVyosVLhjp48hD66hdqLjlhp2SibWXkA4QIXi6jPw9MJZ3RgVlrC56bYvxKXFWAJ + 42Z2EUwJraGYrNP5SFswOQPebOnfZa5l94szUf0nfgKUgrJQZEpDwejlwtOYBveD/TU2HrkrjM23 + sdBkJRMVaKVZ/41LlW1KkpNNGcZr7J28HwAAAP//AwBPQ8gX7AIAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8f39fdd11ed0175e-SJC + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Tue, 17 Dec 2024 21:26:27 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: 
+ - future-house-xr4tdh + openai-processing-ms: + - "196" + openai-version: + - "2020-10-01" + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - "10000" + x-ratelimit-limit-tokens: + - "30000000" + x-ratelimit-remaining-requests: + - "9999" + x-ratelimit-remaining-tokens: + - "29999867" + x-ratelimit-reset-requests: + - 6ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_9dd0f40823dceb910336a862f0513c68 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_eval_answer[llm basic].yaml b/tests/cassettes/test_eval_answer[llm basic].yaml index 6355f1c6..63f1bb18 100644 --- a/tests/cassettes/test_eval_answer[llm basic].yaml +++ b/tests/cassettes/test_eval_answer[llm basic].yaml @@ -5,9 +5,9 @@ interactions: question, and a proposed answer to the question. Please tell me if the proposed answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. No other output is permitted.\n\nQuestion: Which of the following is most likely - true:\n\n A) Piggie, B) Pigeon, C) Gerald\n \n\nCorrect answer: C \n\nProposed + true:\n\nA) Piggie, B) Pigeon, C) Gerald\n\n\nCorrect answer: C\n\nProposed answer: Based on all factors considered, the most compelling answer is Gerald, - C", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}' + C", "role": "user"}], "model": "gpt-4o", "temperature": 0}' headers: accept: - application/json @@ -16,13 +16,13 @@ interactions: connection: - keep-alive content-length: - - "524" + - "516" content-type: - application/json host: - api.openai.com user-agent: - - AsyncOpenAI/Python 1.48.0 + - AsyncOpenAI/Python 1.57.4 x-stainless-arch: - arm64 x-stainless-async: @@ -32,7 +32,7 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.48.0 + - 1.57.4 x-stainless-raw-response: - "true" x-stainless-retry-count: @@ -40,23 +40,24 @@ interactions: x-stainless-runtime: - CPython x-stainless-runtime-version: - - 3.12.5 + - 3.12.7 
method: POST uri: https://api.openai.com/v1/chat/completions response: body: string: !!binary | - H4sIAAAAAAAAA3SQzW6DMBCE7zyFtedQASVQuKVqTlErtekhVVUhYxbi1tgWdtSfKO9eGQihh158 - mG9nPLtHjxDgFeQE2J5a1mrhr26f6yx8SjaP6y3fyfjnQS7p/Se73qx2d7BwDlW+I7Nn1xVTrRZo - uZIDZh1Siy41TKM0ym6ybNmDVlUonK3R1o+V33LJ/SiIYj9I/fBmdO8VZ2ggJ68eIYQc+9f1lBV+ - QU6CxVlp0RjaIOTTECHQKeEUoMZwY6m0sLhApqRF2Vd/WW/npMP6YKhrJw9CjPpp+kqoRneqNCOf - 9JpLbvZFh9Qo6WKNVRp6evIIeetXOvxpCbpTrbaFVR8oXWAYLIc8uFxyRkdmlaVibkr+MxUVWsqF - md0FhoJcNpeEYGrZrwnm21hsi5rLBjvd8eFMtS7CsoyTMEnrDLyT9wsAAP//AwDs99+sNAIAAA== + H4sIAAAAAAAAAwAAAP//jJI/b8IwEMX3fArLM6kCpPzbGDpQVahSh4pWVWTsS+Li+CzbkaCI7145 + AZKqVOri4X73nt+dfYwIoVLQBaG8ZJ5XRsXL/G09e3zePyX1YbxZrhzuynKTi9WXW7/SQVDg9hO4 + v6juOFZGgZeoW8wtMA/BdTgdp+k0mc5GDahQgAqywvg4xXiUjNI4mcXJ5CwsUXJwdEHeI0IIOTZn + iKgF7OmCJINLpQLnWAF0cW0ihFpUoUKZc9J5pj0ddJCj9qCb1JuHlz6xkNeOhWC6VupcP12vUlgY + i1t35td6LrV0ZWaBOdTB1nk0tKGniJCPZqT6R0pqLFbGZx53oIPhMLlv/Wi3xB49M4+eqb5oMrhh + lwnwTCrX2wnljJcgOmm3QFYLiT0Q9Yb+HeaWdzu41MV/7DvAORgPIjMWhOQ/B+7aLIQv9lfbdclN + YOoOzkOV5VIXYI2V7SvnJpvnbMvmeTKe0egUfQMAAP//AwAWS34s7gIAAA== headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c8e093e1c03cf61-SJC + - 8f39fdb5cae1158a-SJC Connection: - keep-alive Content-Encoding: @@ -64,14 +65,14 @@ interactions: Content-Type: - application/json Date: - - Wed, 25 Sep 2024 21:16:35 GMT + - Tue, 17 Dec 2024 21:26:22 GMT Server: - cloudflare Set-Cookie: - - __cf_bm=yrnktGfLlTro_mHAi1eDs07tPAi.UO9vu4j2ZH79Tpg-1727298995-1.0.1.1-P4rF_tY6JKwj5f7PAq_pHP4sB0tH4oW2cKEw9s49GhjJh7xBzJvGDdQhTjpCmBpHQxeJgCvh8dFCe3ZZU6JlqA; - path=/; expires=Wed, 25-Sep-24 21:46:35 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=lVkT7i5qloNOJW3VW5kf8Ohm6U080WiPUv6XirXCoFk-1734470782-1.0.1.1-nAgxt2GizSWkF.auEc_j1tv3Erjbd74Lsh9WJmMaZa_E8fpVuEZ8SsBIqLBHICQDV0sfwSjHgP9mTBHQujl_XA; + path=/; expires=Tue, 17-Dec-24 21:56:22 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - 
_cfuvid=NDOCPHgoZ2Ov2vByfj1bojq4LnSC2lXMWDthSvh6SdE-1727298995191-0.0.1.1-604800000; + - _cfuvid=YCWb3aZdtzEmsWTuiPgC.gchnL7jvJLEWh9yvJqAiAw-1734470782603-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked @@ -84,25 +85,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "92" + - "124" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "30000" + - "10000" x-ratelimit-limit-tokens: - - "150000000" + - "30000000" x-ratelimit-remaining-requests: - - "29999" + - "9999" x-ratelimit-remaining-tokens: - - "149999876" + - "29999876" x-ratelimit-reset-requests: - - 2ms + - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_782956501d1cbc9f3e44b8c9ceee21ec + - req_84a9ec4746765b74e4d84610ebc880ad status: code: 200 message: OK diff --git a/tests/cassettes/test_eval_llm_config.yaml b/tests/cassettes/test_eval_llm_config.yaml index 49310cb6..383479dc 100644 --- a/tests/cassettes/test_eval_llm_config.yaml +++ b/tests/cassettes/test_eval_llm_config.yaml @@ -4,8 +4,8 @@ interactions: '{"messages": [{"content": "Here is a question, the correct answer to the question, and a proposed answer to the question. Please tell me if the proposed answer is correct, given the correct answer. ONLY SAY ''YES'' OR ''NO''. No - other output is permitted.\n\nQuestion: What is 25 * 10? 
\n\nCorrect answer: - 250 \n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o-mini", "temperature": + other output is permitted.\n\nQuestion: What is 25 * 10?\n\nCorrect answer: + 250\n\nProposed answer: 250", "role": "user"}], "model": "gpt-4o", "temperature": 0.5}' headers: accept: @@ -15,13 +15,13 @@ interactions: connection: - keep-alive content-length: - - "394" + - "387" content-type: - application/json host: - api.openai.com user-agent: - - AsyncOpenAI/Python 1.48.0 + - AsyncOpenAI/Python 1.57.4 x-stainless-arch: - arm64 x-stainless-async: @@ -31,31 +31,32 @@ interactions: x-stainless-os: - MacOS x-stainless-package-version: - - 1.48.0 + - 1.57.4 x-stainless-raw-response: - "true" x-stainless-retry-count: - - "0" + - "1" x-stainless-runtime: - CPython x-stainless-runtime-version: - - 3.12.5 + - 3.12.7 method: POST uri: https://api.openai.com/v1/chat/completions response: body: string: !!binary | - H4sIAAAAAAAAA3SQS0/DMBCE7/kV1p4blEShedx4VEJwAFQ4IIQiJ9kkBr9kuwhU9b8jpyEpBy4+ - zLcznt19QAiwFkoCzUBdIzQPLy6fuuI5ij6z7vF6fZff39itYMNt2l89bGDlHap+x8b9us4aJTRH - x5Q84sYgdehT4yzJkiIvivMRCNUi97ZeuzBVoWCShUmUpGGUhXE+uQfFGrRQkteAEEL24+t7yha/ - oCTR6lcRaC3tEcp5iBAwinsFqLXMOiodrBbYKOlQjtVfNttTYrDbWerbyR3nk36Yv+Kq10bVduKz - 3jHJ7FAZpFZJH2ud0jDSQ0DI27jS7k9L0EYJ7SqnPlD6wHzaCJZDLjCemFOO8hNP/J+natFRxu3J - VeBYj8l+CYjmjuOSYL+tQ1F1TPZotGHHI3W6ius6XcfrrCsgOAQ/AAAA//8DAMNNIlYyAgAA + H4sIAAAAAAAAAwAAAP//jJLNasMwEITvfgqhc1ycH+okt0Ja6KUthBzaUowirW2lslZIG0gJefci + x4kT2kIvPsy3M55de58wxrXic8ZlLUg2zqR35dvTLF+8PJgRPhLUrtwszWKVPa92M+CD6MD1BiSd + XDcSG2eANNojlh4EQUwd5uPJJM/y6bgFDSow0VY5SieYjrLRJM2maXbbGWvUEgKfs/eEMcb27TNW + tAp2fM6ywUlpIARRAZ+fhxjjHk1UuAhBBxKW+KCHEi2BbVu/3i8viYdyG0QsZrfGdPrh/CqDlfO4 + Dh0/66W2OtSFBxHQxthA6HhLDwljH+1K26uW3HlsHBWEn2BjYD47xvH+hj0cdoyQhOnlaXeF67BC + AQltwsVFuBSyBtU7+/OJrdJ4AZKLlX92+S37uLa21X/ieyAlOAJVOA9Ky+t9+zEP8Qf7a+x84rYw + D1+BoClKbSvwzuvjNy5dIfKZmiohhyVPDsk3AAAA//8DADQLsKzsAgAA headers: CF-Cache-Status: - DYNAMIC CF-RAY: - - 8c8e093e1c06cf1b-SJC + 
- 8f39fdbac9db643b-SJC Connection: - keep-alive Content-Encoding: @@ -63,15 +64,9 @@ interactions: Content-Type: - application/json Date: - - Wed, 25 Sep 2024 21:16:35 GMT + - Tue, 17 Dec 2024 21:26:23 GMT Server: - cloudflare - Set-Cookie: - - __cf_bm=pym14eCUZA0SXlddekdvYvv4El15fQ5PcElgWShT4_0-1727298995-1.0.1.1-j7XxNBD437f6YvFS5ZFNifdIzdOW1J.vAbnfSMGsYibmP7pxa4RBjnc1c9T7BMJR8ctMw5kl28D5nFYJGkS7Zw; - path=/; expires=Wed, 25-Sep-24 21:46:35 GMT; domain=.api.openai.com; HttpOnly; - Secure; SameSite=None - - _cfuvid=ipHIXWJzY6qksCWMRv8pk2ueHiG5GB8g2GBZbMGPsBc-1727298995230-0.0.1.1-604800000; - path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None Transfer-Encoding: - chunked X-Content-Type-Options: @@ -83,25 +78,25 @@ interactions: openai-organization: - future-house-xr4tdh openai-processing-ms: - - "133" + - "231" openai-version: - "2020-10-01" strict-transport-security: - max-age=31536000; includeSubDomains; preload x-ratelimit-limit-requests: - - "30000" + - "10000" x-ratelimit-limit-tokens: - - "150000000" + - "30000000" x-ratelimit-remaining-requests: - - "29999" + - "9999" x-ratelimit-remaining-tokens: - - "149999907" + - "29999909" x-ratelimit-reset-requests: - - 2ms + - 6ms x-ratelimit-reset-tokens: - 0s x-request-id: - - req_02646a0ce82be0e9b7240968034e1d6d + - req_6538a77713d1ec9b61a8e15f3cf37377 status: code: 200 message: OK diff --git a/tests/test_utils.py b/tests/test_utils.py index de46749c..3469f01c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,10 @@ +from collections.abc import Iterable, Sequence + import pytest from aviary.core import eval_answer +from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion +from tests.conftest import VCR_DEFAULT_MATCH_ON @pytest.mark.vcr @@ -40,3 +44,221 @@ async def test_eval_answer( async def test_eval_llm_config(): config = {"temperature": 0.5} assert await eval_answer("250", "250", "What is 25 * 10?", "llm", config) + + +class TestLitQAEvaluation: + @staticmethod + 
def _assert_prompt_is_valid( + mc_question: MultipleChoiceQuestion, + question: str, + ideal_answer: str, + distractors: Iterable[str], + ) -> None: + question_prompt = mc_question.question_prompt + for substr in ( + question, + "Insufficient information", + ideal_answer, + *distractors, + ): + assert question_prompt.count(substr) == 1 + + # Use for general purpose testing + ZIP_CODE_QUESTION_IDEAL_DISTRACTORS = ( + "What is my office's zip code?", + "94107", + ["-8", "94106", "cheesecake"], + ) + # The following two are used to check we don't leak on the LLM's innate knowledge + MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS = ( + "What is the meaning of life?", + "42", + ["-84", "11", "cheesecake"], + ) + # Source: https://github.com/Future-House/LAB-Bench/blob/43b2045c67a2da12c233689cf538f1ed5c42f590/LitQA2/litqa-v2-public.jsonl#L130 + LITQA2_QUESTION_IDEAL_DISTRACTORS = ( + ( + "What method was used to demonstrate that the enzyme PafA is stable after" + " incubation with 4M urea for 14 days?" 
+ ), + "circular dichroism", + ["cryo EM", "x-ray crystallography", "NMR"], + ) + + @pytest.mark.asyncio + @pytest.mark.vcr(match_on=[*VCR_DEFAULT_MATCH_ON, "body"]) + @pytest.mark.parametrize( + ( + "question", + "ideal_answer", + "distractors", + "actual_answer", + "expected_eval", + "expected_extracted_answer", + ), + [ + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 94107", + MultipleChoiceEvaluation.CORRECT, + "94107", + id="matched-correct-option", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 14004", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="didnt-match-and-no-llm-innate-knowledge", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 94106", + MultipleChoiceEvaluation.INCORRECT, + "94106", + id="matched-incorrect-option", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "Insufficient information", + MultipleChoiceEvaluation.UNSURE, + MultipleChoiceQuestion.DEFAULT_UNSURE_OPTION, + id="matched-unsure-option", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "the answer is 94106 or 94107", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="matched-several-options", + ), + pytest.param( + *ZIP_CODE_QUESTION_IDEAL_DISTRACTORS, + "", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="empty-answer1", + ), + pytest.param( + *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, + "14", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="didnt-match-and-llm-has-innate-knowledge", + ), + pytest.param( + *MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS, + "", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="empty-answer2", + ), + pytest.param( + *LITQA2_QUESTION_IDEAL_DISTRACTORS, + "", + MultipleChoiceEvaluation.INCORRECT, + "0", + id="empty-answer3", + ), + ], + ) + async def test_grade( + self, + question: str, + ideal_answer: str, + distractors: str | list[str], + actual_answer: str, + expected_eval: MultipleChoiceEvaluation, + expected_extracted_answer: str, 
+ ) -> None: + """Tests that we can create a multiple choice question and evaluate answers.""" + mc_question = MultipleChoiceQuestion( + question=question, + options=distractors, + ideal_answer=ideal_answer, + shuffle_seed=42, # Seed for VCR cassette + ) + self._assert_prompt_is_valid(mc_question, question, ideal_answer, distractors) + evaluation, _, graded_answer = await mc_question.grade(actual_answer) + assert evaluation == expected_eval + if evaluation == MultipleChoiceEvaluation.CORRECT: + assert graded_answer == ideal_answer + assert graded_answer == expected_extracted_answer + + def test_consistent_mc_options(self) -> None: + """Tests that creating multiple evaluations with the same seed results in the same prompt.""" + question, ideal, distractors = self.MEANING_OF_LIFE_QUESTION_IDEAL_DISTRACTORS + mc_question_1a = MultipleChoiceQuestion( + question=question, ideal_answer=ideal, options=distractors, shuffle_seed=0 + ) + self._assert_prompt_is_valid(mc_question_1a, question, ideal, distractors) + + mc_question_1b = MultipleChoiceQuestion( + question=question, ideal_answer=ideal, options=distractors, shuffle_seed=0 + ) + self._assert_prompt_is_valid(mc_question_1b, question, ideal, distractors) + assert mc_question_1a == mc_question_1b, ( + "Same seeding should lead to same prompts" + ) + + mc_question_1a_copy = MultipleChoiceQuestion(**mc_question_1a.model_dump()) + self._assert_prompt_is_valid(mc_question_1a_copy, question, ideal, distractors) + assert mc_question_1a == mc_question_1b, ( + "Serialization then deserialization should lead to same prompts" + ) + + mc_question_2a = MultipleChoiceQuestion( + question=question, + ideal_answer=ideal, + options=distractors, + shuffle_seed=MultipleChoiceQuestion.SEED_USING_QUESTION, + ) + self._assert_prompt_is_valid(mc_question_2a, question, ideal, distractors) + + mc_question_2b = MultipleChoiceQuestion( + question=question, + ideal_answer=ideal, + options=distractors, + 
shuffle_seed=MultipleChoiceQuestion.SEED_USING_QUESTION, + ) + self._assert_prompt_is_valid(mc_question_2b, question, ideal, distractors) + assert mc_question_2a == mc_question_2b, ( + "Same seeding strategy should lead to same prompts" + ) + assert mc_question_2a != mc_question_1a, ( + "Different seeding strategies should lead to different prompts" + ) + + +class TestMultipleChoiceEvaluation: + @pytest.mark.parametrize( + ("evals", "accuracy_precision"), + [ + ( + [ + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.CORRECT, + ], + (1, 1), + ), + (["correct", "correct", "unsure"], (2 / 3, 1)), + ( + [ + MultipleChoiceEvaluation.CORRECT, + MultipleChoiceEvaluation.UNSURE, + "incorrect", + ], + (1 / 3, 1 / 2), + ), + ], + ) + def test_calculate_accuracy_precision( + self, + evals: Sequence[MultipleChoiceEvaluation], + accuracy_precision: tuple[float, float], + ) -> None: + assert ( + MultipleChoiceEvaluation.calculate_accuracy_precision(evals) + == accuracy_precision + )