From a7aa6bed833dc510aff0e957ac5b1b96abe5d935 Mon Sep 17 00:00:00 2001
From: Daniel Zhu
Date: Thu, 18 Jul 2024 13:10:57 -0700
Subject: [PATCH] chore: rename factual knowledge scores (#312)

---
 .../eval_algorithms/factual_knowledge.py      | 10 +--
 src/fmeval/reporting/constants.py             |  6 +-
 src/fmeval/reporting/eval_output_cells.py     |  6 +-
 test/integration/test_factual_knowledge.py    | 46 +++++-----
 .../eval_algorithms/test_factual_knowledge.py | 84 +++++++++----------
 5 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/src/fmeval/eval_algorithms/factual_knowledge.py b/src/fmeval/eval_algorithms/factual_knowledge.py
index c0ad296a..86cda648 100644
--- a/src/fmeval/eval_algorithms/factual_knowledge.py
+++ b/src/fmeval/eval_algorithms/factual_knowledge.py
@@ -25,9 +25,9 @@
 from fmeval.transforms.util import validate_call
 from fmeval.util import get_eval_results_path
 
-EXACT_INCLUSION = "exact_inclusion"
-QUASI_EXACT_INCLUSION = "quasi_exact_inclusion"
-SCORE_NAMES = [EXACT_INCLUSION, QUASI_EXACT_INCLUSION]
+FACTUAL_KNOWLEDGE = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
+FACTUAL_KNOWLEDGE_QUASI_EXACT = "factual_knowledge_quasi_exact"
+SCORE_NAMES = [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT]
 
 logger = logging.getLogger(__name__)
@@ -69,8 +69,8 @@ def _quasi_exact_inclusion_score(model_output: str, target_output: str) -> float
 
 
 FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
-    EXACT_INCLUSION: _exact_inclusion_score,
-    QUASI_EXACT_INCLUSION: _quasi_exact_inclusion_score,
+    FACTUAL_KNOWLEDGE: _exact_inclusion_score,
+    FACTUAL_KNOWLEDGE_QUASI_EXACT: _quasi_exact_inclusion_score,
 }
diff --git a/src/fmeval/reporting/constants.py b/src/fmeval/reporting/constants.py
index ce6a9c78..4a0e6098 100644
--- a/src/fmeval/reporting/constants.py
+++ b/src/fmeval/reporting/constants.py
@@ -1,7 +1,7 @@
 from enum import Enum
 from typing import NamedTuple, Tuple, List
 
-from fmeval.eval_algorithms.factual_knowledge import EXACT_INCLUSION, QUASI_EXACT_INCLUSION
+from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING, LOG_PROBABILITY_DIFFERENCE
 from fmeval.eval_algorithms.qa_accuracy import (
     F1_SCORE,
@@ -167,8 +167,8 @@ class ListType(str, Enum):
 # Score descriptions for the report
 SCORE_DESCRIPTIONS = {
     # Factual knowledge
-    EXACT_INCLUSION: "Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.",
-    QUASI_EXACT_INCLUSION: "Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.",
+    FACTUAL_KNOWLEDGE: "Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.",
+    FACTUAL_KNOWLEDGE_QUASI_EXACT: "Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.",
     # Prompt stereotyping
     PROMPT_STEREOTYPING: "The is_biased prompt stereotyping score measures whether the model assigns higher probability to the more stereotypical sentence than the less stereotypical one, i.e., p(Smore) > p(Sless). It is a value between 0 and 1, where 1 indicates that the model always prefers the more stereotypical sentence while 0 means that it never prefers the more stereotypical sentence. An unbiased model prefers more and less stereotypical sentences at equal rates, corresponding to a score of 0.5",
     LOG_PROBABILITY_DIFFERENCE: "For each sentence pair, we report the log probability difference, a value ranging -∞ to ∞, indicating how much the model stereotypes. ",
diff --git a/src/fmeval/reporting/eval_output_cells.py b/src/fmeval/reporting/eval_output_cells.py
index f1891444..7534f880 100644
--- a/src/fmeval/reporting/eval_output_cells.py
+++ b/src/fmeval/reporting/eval_output_cells.py
@@ -11,7 +11,7 @@
     get_default_prompt_template,
 )
 from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE
-from fmeval.eval_algorithms.factual_knowledge import EXACT_INCLUSION, QUASI_EXACT_INCLUSION
+from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
 from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
 from fmeval.constants import DatasetColumns, DATASET_COLUMNS
@@ -340,7 +340,9 @@ def __init__(
         present_columns = [col for col in dataset.columns() if col in columns]
         dataset = dataset.select_columns(present_columns)
         is_binary_score = (
-            True if score_name in [EXACT_INCLUSION, QUASI_EXACT_INCLUSION, CLASSIFICATION_ACCURACY_SCORE] else False
+            True
+            if score_name in [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT, CLASSIFICATION_ACCURACY_SCORE]
+            else False
         )
         cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))
         super().__init__(*cells)
diff --git a/test/integration/test_factual_knowledge.py b/test/integration/test_factual_knowledge.py
index 56d879b0..041874b4 100644
--- a/test/integration/test_factual_knowledge.py
+++ b/test/integration/test_factual_knowledge.py
@@ -9,8 +9,8 @@
 from fmeval.eval_algorithms.factual_knowledge import (
     FactualKnowledge,
     FactualKnowledgeConfig,
-    EXACT_INCLUSION,
-    QUASI_EXACT_INCLUSION,
+    FACTUAL_KNOWLEDGE,
+    FACTUAL_KNOWLEDGE_QUASI_EXACT,
 )
 from fmeval.data_loaders.data_config import DataConfig
 from fmeval.constants import MIME_TYPE_JSONLINES
@@ -33,9 +33,9 @@ def test_evaluate_sample(self):
         )
         # the model produces deterministic output
         for eval_score in eval_scores:
-            if eval_score.name == EXACT_INCLUSION:
+            if eval_score.name == FACTUAL_KNOWLEDGE:
                 assert eval_score.value == 1.0
-            elif eval_score.name == QUASI_EXACT_INCLUSION:
+            elif eval_score.name == FACTUAL_KNOWLEDGE_QUASI_EXACT:
                 assert eval_score.value == 1.0
 
 class EvaluateTestCase(NamedTuple):
@@ -44,23 +44,23 @@ class EvaluateTestCase(NamedTuple):
     category_scores: Dict[str, Dict[str, float]]
 
 DATASET_SCORES = [
-    EvalScore(name=EXACT_INCLUSION, value=0.547),
-    EvalScore(name=QUASI_EXACT_INCLUSION, value=0.547),
+    EvalScore(name=FACTUAL_KNOWLEDGE, value=0.547),
+    EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.547),
 ]
 
 CATEGORY_SCORES = [
     CategoryScore(
         name="Capitals",
         scores=[
-            EvalScore(name=EXACT_INCLUSION, value=0.09),
-            EvalScore(name=QUASI_EXACT_INCLUSION, value=0.09),
+            EvalScore(name=FACTUAL_KNOWLEDGE, value=0.09),
+            EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.09),
         ],
     ),
     CategoryScore(
         name="Subsidiary",
         scores=[
-            EvalScore(name=EXACT_INCLUSION, value=0.0198),
-            EvalScore(name=QUASI_EXACT_INCLUSION, value=0.0198),
+            EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0198),
+            EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.0198),
         ],
     ),
 ]
@@ -70,10 +70,10 @@ class EvaluateTestCase(NamedTuple):
     [
         EvaluateTestCase(
            dataset_name="trex_sample.jsonl",
-            dataset_score={EXACT_INCLUSION: 0.0547, QUASI_EXACT_INCLUSION: 0.0547},
+            dataset_score={FACTUAL_KNOWLEDGE: 0.0547, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0547},
             category_scores={
-                "Capitals": {EXACT_INCLUSION: 0.09, QUASI_EXACT_INCLUSION: 0.09},
-                "Subsidiary": {EXACT_INCLUSION: 0.0198, QUASI_EXACT_INCLUSION: 0.0198},
+                "Capitals": {FACTUAL_KNOWLEDGE: 0.09, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.09},
+                "Subsidiary": {FACTUAL_KNOWLEDGE: 0.0198, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0198},
             },
         ),
         # The purpose of testing evaluate() on this tiny dataset is to
@@ -83,8 +83,8 @@ class EvaluateTestCase(NamedTuple):
         # See https://github.com/ray-project/ray/pull/39960
         EvaluateTestCase(
             dataset_name="trex_sample_small.jsonl",
-            dataset_score={EXACT_INCLUSION: 0.0, QUASI_EXACT_INCLUSION: 0.0},
-            category_scores={"Capitals": {EXACT_INCLUSION: 0.0, QUASI_EXACT_INCLUSION: 0.0}},
+            dataset_score={FACTUAL_KNOWLEDGE: 0.0, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0},
+            category_scores={"Capitals": {FACTUAL_KNOWLEDGE: 0.0, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0}},
         ),
     ],
 )
@@ -107,20 +107,20 @@ def test_evaluate(self, integration_tests_dir, test_case):
         for eval_score in eval_output.dataset_scores:  # pragma: no branch
-            if eval_score.name == EXACT_INCLUSION:
-                assert eval_score.value == approx(test_case.dataset_score[EXACT_INCLUSION], abs=ABS_TOL)
-            elif eval_score.name == QUASI_EXACT_INCLUSION:
-                assert eval_score.value == approx(test_case.dataset_score[QUASI_EXACT_INCLUSION], abs=ABS_TOL)
+            if eval_score.name == FACTUAL_KNOWLEDGE:
+                assert eval_score.value == approx(test_case.dataset_score[FACTUAL_KNOWLEDGE], abs=ABS_TOL)
+            elif eval_score.name == FACTUAL_KNOWLEDGE_QUASI_EXACT:
+                assert eval_score.value == approx(test_case.dataset_score[FACTUAL_KNOWLEDGE_QUASI_EXACT], abs=ABS_TOL)
         for category_score in eval_output.category_scores:  # pragma: no branch
             for eval_score in category_score.scores:
-                if eval_score.name == EXACT_INCLUSION:
+                if eval_score.name == FACTUAL_KNOWLEDGE:
                     assert eval_score.value == approx(
-                        test_case.category_scores[category_score.name][EXACT_INCLUSION], abs=ABS_TOL
+                        test_case.category_scores[category_score.name][FACTUAL_KNOWLEDGE], abs=ABS_TOL
                     )
-                elif eval_score.name == QUASI_EXACT_INCLUSION:
+                elif eval_score.name == FACTUAL_KNOWLEDGE_QUASI_EXACT:
                     assert eval_score.value == approx(
-                        test_case.category_scores[category_score.name][QUASI_EXACT_INCLUSION], abs=ABS_TOL
+                        test_case.category_scores[category_score.name][FACTUAL_KNOWLEDGE_QUASI_EXACT], abs=ABS_TOL
                     )
 
     def test_evaluate_multi_datasets(self, integration_tests_dir):
diff --git a/test/unit/eval_algorithms/test_factual_knowledge.py b/test/unit/eval_algorithms/test_factual_knowledge.py
index 46da1bba..ea8cc548 100644
--- a/test/unit/eval_algorithms/test_factual_knowledge.py
+++ b/test/unit/eval_algorithms/test_factual_knowledge.py
@@ -15,8 +15,8 @@
 from fmeval.eval_algorithms.factual_knowledge import (
     FactualKnowledge,
     FactualKnowledgeConfig,
-    EXACT_INCLUSION,
-    QUASI_EXACT_INCLUSION,
+    FACTUAL_KNOWLEDGE,
+    FACTUAL_KNOWLEDGE_QUASI_EXACT,
     _exact_inclusion_score,
     _quasi_exact_inclusion_score,
 )
@@ -113,8 +113,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -124,8 +124,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -135,8 +135,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -146,8 +146,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         # Adding tests for quasi-exact inclusion
         TestCaseFactualKnowledgeEvaluateSample(
@@ -158,8 +158,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -169,8 +169,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -180,8 +180,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -191,8 +191,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="OR",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         # tests that all facts are in the model output
         TestCaseFactualKnowledgeEvaluateSample(
@@ -203,8 +203,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         # tests out of order
         TestCaseFactualKnowledgeEvaluateSample(
@@ -215,8 +215,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -226,8 +226,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -238,8 +238,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -250,8 +250,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -261,8 +261,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=1.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
             ],
         ),
         TestCaseFactualKnowledgeEvaluateSample(
@@ -272,8 +272,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.0),
             ],
         ),
         # none of the target facts are contained in the model output
         TestCaseFactualKnowledgeEvaluateSample(
@@ -284,8 +284,8 @@ class TestCaseFactualKnowledgeEvaluateSample(NamedTuple):
             target_delimiter="",
             logic_operator="AND",
             expected_response=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.0),
             ],
         ),
     ],
 )
@@ -359,29 +359,29 @@ class TestCaseFactualKnowledgeEvaluate(NamedTuple):
             dataset_name="my_custom_dataset",
             prompt_template=None,
             dataset_scores=[
-                EvalScore(name=EXACT_INCLUSION, value=2 / 3),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=5 / 6),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=2 / 3),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=5 / 6),
             ],
             category_scores=[
                 CategoryScore(
                     name="Capitals",
                     scores=[
-                        EvalScore(name=EXACT_INCLUSION, value=0.5),
-                        EvalScore(name=QUASI_EXACT_INCLUSION, value=0.5),
+                        EvalScore(name=FACTUAL_KNOWLEDGE, value=0.5),
+                        EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.5),
                     ],
                 ),
                 CategoryScore(
                     name="Movies",
                     scores=[
-                        EvalScore(name=EXACT_INCLUSION, value=1.0),
-                        EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                        EvalScore(name=FACTUAL_KNOWLEDGE, value=1.0),
+                        EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
                     ],
                 ),
                 CategoryScore(
                     name="History",
                     scores=[
-                        EvalScore(name=EXACT_INCLUSION, value=0.5),
-                        EvalScore(name=QUASI_EXACT_INCLUSION, value=1.0),
+                        EvalScore(name=FACTUAL_KNOWLEDGE, value=0.5),
+                        EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=1.0),
                     ],
                 ),
             ],
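
Note for reviewers: the patch renames the score constants but does not show the bodies of _exact_inclusion_score and _quasi_exact_inclusion_score. The following is a rough, hypothetical sketch of what the two renamed scores compute, based only on the SCORE_DESCRIPTIONS text above; function names and normalization details here are illustrative assumptions, not the actual fmeval implementation.

    import string

    def exact_inclusion(model_output: str, target_output: str) -> float:
        # 1.0 if the target string appears verbatim in the model output, else 0.0
        # (assumption: the real _exact_inclusion_score may additionally normalize case).
        return 1.0 if target_output in model_output else 0.0

    def quasi_exact_inclusion(model_output: str, target_output: str) -> float:
        # Same containment check after stripping punctuation and collapsing excess
        # whitespace, per the FACTUAL_KNOWLEDGE_QUASI_EXACT description.
        def normalize(text: str) -> str:
            text = text.translate(str.maketrans("", "", string.punctuation))
            return " ".join(text.split())
        return 1.0 if normalize(target_output) in normalize(model_output) else 0.0

Per-row scores are binary; the dataset-level and per-category numbers asserted in the tests are averages of these per-row scores, as described in the report text.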