chore: rename factual knowledge scores (#312)
danielezhu authored Jul 18, 2024
1 parent a38c880 commit a7aa6be
Showing 5 changed files with 77 additions and 75 deletions.
10 changes: 5 additions & 5 deletions src/fmeval/eval_algorithms/factual_knowledge.py
@@ -25,9 +25,9 @@
 from fmeval.transforms.util import validate_call
 from fmeval.util import get_eval_results_path
 
-EXACT_INCLUSION = "exact_inclusion"
-QUASI_EXACT_INCLUSION = "quasi_exact_inclusion"
-SCORE_NAMES = [EXACT_INCLUSION, QUASI_EXACT_INCLUSION]
+FACTUAL_KNOWLEDGE = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
+FACTUAL_KNOWLEDGE_QUASI_EXACT = "factual_knowledge_quasi_exact"
+SCORE_NAMES = [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT]
 
 logger = logging.getLogger(__name__)
 
@@ -69,8 +69,8 @@ def _quasi_exact_inclusion_score(model_output: str, target_output: str) -> float
 
 
 FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
-    EXACT_INCLUSION: _exact_inclusion_score,
-    QUASI_EXACT_INCLUSION: _quasi_exact_inclusion_score,
+    FACTUAL_KNOWLEDGE: _exact_inclusion_score,
+    FACTUAL_KNOWLEDGE_QUASI_EXACT: _quasi_exact_inclusion_score,
 }
 
 
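Context for the rename: the two functions that these score names dispatch to are unchanged by this commit. Below is a minimal, self-contained sketch of what `_exact_inclusion_score` and `_quasi_exact_inclusion_score` plausibly compute, based on the score descriptions in src/fmeval/reporting/constants.py further down; the lowercasing and punctuation/whitespace handling are illustrative assumptions, not fmeval's exact implementation.

```python
import string
from typing import Callable, Dict

# Stand-ins for the renamed constants; in fmeval, FACTUAL_KNOWLEDGE comes from
# EvalAlgorithm.FACTUAL_KNOWLEDGE.value.
FACTUAL_KNOWLEDGE = "factual_knowledge"
FACTUAL_KNOWLEDGE_QUASI_EXACT = "factual_knowledge_quasi_exact"


def _exact_inclusion_score(model_output: str, target_output: str) -> float:
    """Binary score: 1.0 if the target output is contained in the model output (case handling assumed)."""
    return 1.0 if target_output.lower() in model_output.lower() else 0.0


def _normalize(text: str) -> str:
    """Assumed normalization: lowercase, drop punctuation, collapse whitespace."""
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    return " ".join(text.split())


def _quasi_exact_inclusion_score(model_output: str, target_output: str) -> float:
    """Same inclusion check, applied to lightly normalized strings."""
    return 1.0 if _normalize(target_output) in _normalize(model_output) else 0.0


# Mirrors the mapping in the diff above: each reported score name dispatches to a scoring function.
FACTUAL_KNOWLEDGE_SCORES_TO_FUNCS: Dict[str, Callable[..., float]] = {
    FACTUAL_KNOWLEDGE: _exact_inclusion_score,
    FACTUAL_KNOWLEDGE_QUASI_EXACT: _quasi_exact_inclusion_score,
}

print(_exact_inclusion_score("The capital of England is London.", "London"))  # 1.0
print(_quasi_exact_inclusion_score("It is  London .", "london"))               # 1.0
```

Both functions return a per-sample binary score; the rename only changes the keys under which those scores appear in results and reports.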
6 changes: 3 additions & 3 deletions src/fmeval/reporting/constants.py
@@ -1,7 +1,7 @@
 from enum import Enum
 from typing import NamedTuple, Tuple, List
 
-from fmeval.eval_algorithms.factual_knowledge import EXACT_INCLUSION, QUASI_EXACT_INCLUSION
+from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING, LOG_PROBABILITY_DIFFERENCE
 from fmeval.eval_algorithms.qa_accuracy import (
     F1_SCORE,
@@ -167,8 +167,8 @@ class ListType(str, Enum):
 # Score descriptions for the report
 SCORE_DESCRIPTIONS = {
     # Factual knowledge
-    EXACT_INCLUSION: "Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.",
-    QUASI_EXACT_INCLUSION: "Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.",
+    FACTUAL_KNOWLEDGE: "Exact inclusion score is a binary score where 1 indicates that the target output is contained in the model output, and 0 indicates otherwise. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts.",
+    FACTUAL_KNOWLEDGE_QUASI_EXACT: "Quasi-exact inclusion is a binary score that represents whether a slightly modified (removing excess whitespace and punctuation) version of the target output is contained in the model output. If yes, the score is 1, otherwise 0. The average over the whole dataset is reported, such that the final score indicates the percentage of correctly retrieved real-world facts after removing excess whitespace and punctuation.",
     # Prompt stereotyping
     PROMPT_STEREOTYPING: "The is_biased prompt stereotyping score measures whether the model assigns higher probability to the more stereotypical sentence than the less stereotypical one, i.e., <math><box>p(S<sub>more</sub>) > p(S<sub>less</sub>)</box></math>. It is a value between 0 and 1, where 1 indicates that the model always prefers the more stereotypical sentence while 0 means that it never prefers the more stereotypical sentence. An unbiased model prefers more and less stereotypical sentences at equal rates, corresponding to a score of 0.5",
     LOG_PROBABILITY_DIFFERENCE: "For each sentence pair, we report the log probability difference, a value ranging -&#8734; to &#8734;, indicating how much the model stereotypes. ",
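Both descriptions note that the per-sample binary scores are averaged over the dataset, so the reported number is the fraction of correctly retrieved facts. Here is a short illustrative sketch of that aggregation; the helper name `aggregate_factual_knowledge` is hypothetical and not part of fmeval.

```python
from statistics import mean
from typing import Dict, List


def aggregate_factual_knowledge(per_sample_scores: List[Dict[str, float]]) -> Dict[str, float]:
    """Average each binary score over all samples; the result is the fraction of
    correctly retrieved facts under each scoring rule."""
    score_names = per_sample_scores[0].keys()
    return {name: mean(sample[name] for sample in per_sample_scores) for name in score_names}


# Three samples scored with both rules; only the quasi-exact rule accepts the second one.
samples = [
    {"factual_knowledge": 1.0, "factual_knowledge_quasi_exact": 1.0},
    {"factual_knowledge": 0.0, "factual_knowledge_quasi_exact": 1.0},
    {"factual_knowledge": 0.0, "factual_knowledge_quasi_exact": 0.0},
]
print(aggregate_factual_knowledge(samples))
# {'factual_knowledge': 0.333..., 'factual_knowledge_quasi_exact': 0.666...}
```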
6 changes: 4 additions & 2 deletions src/fmeval/reporting/eval_output_cells.py
@@ -11,7 +11,7 @@
     get_default_prompt_template,
 )
 from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE
-from fmeval.eval_algorithms.factual_knowledge import EXACT_INCLUSION, QUASI_EXACT_INCLUSION
+from fmeval.eval_algorithms.factual_knowledge import FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT
 from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
 from fmeval.constants import DatasetColumns, DATASET_COLUMNS
@@ -340,7 +340,9 @@ def __init__(
         present_columns = [col for col in dataset.columns() if col in columns]
         dataset = dataset.select_columns(present_columns)
         is_binary_score = (
-            True if score_name in [EXACT_INCLUSION, QUASI_EXACT_INCLUSION, CLASSIFICATION_ACCURACY_SCORE] else False
+            True
+            if score_name in [FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT, CLASSIFICATION_ACCURACY_SCORE]
+            else False
         )
         cells.append(ScoreTableCell(dataset, score_column_name, binary=is_binary_score))
         super().__init__(*cells)
46 changes: 23 additions & 23 deletions test/integration/test_factual_knowledge.py
@@ -9,8 +9,8 @@
 from fmeval.eval_algorithms.factual_knowledge import (
     FactualKnowledge,
     FactualKnowledgeConfig,
-    EXACT_INCLUSION,
-    QUASI_EXACT_INCLUSION,
+    FACTUAL_KNOWLEDGE,
+    FACTUAL_KNOWLEDGE_QUASI_EXACT,
 )
 from fmeval.data_loaders.data_config import DataConfig
 from fmeval.constants import MIME_TYPE_JSONLINES
@@ -33,9 +33,9 @@ def test_evaluate_sample(self):
         )
         # the model produces deterministic output
         for eval_score in eval_scores:
-            if eval_score.name == EXACT_INCLUSION:
+            if eval_score.name == FACTUAL_KNOWLEDGE:
                 assert eval_score.value == 1.0
-            elif eval_score.name == QUASI_EXACT_INCLUSION:
+            elif eval_score.name == FACTUAL_KNOWLEDGE_QUASI_EXACT:
                 assert eval_score.value == 1.0
 
     class EvaluateTestCase(NamedTuple):
@@ -44,23 +44,23 @@ class EvaluateTestCase(NamedTuple):
         category_scores: Dict[str, Dict[str, float]]
 
     DATASET_SCORES = [
-        EvalScore(name=EXACT_INCLUSION, value=0.547),
-        EvalScore(name=QUASI_EXACT_INCLUSION, value=0.547),
+        EvalScore(name=FACTUAL_KNOWLEDGE, value=0.547),
+        EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.547),
     ]
 
     CATEGORY_SCORES = [
         CategoryScore(
             name="Capitals",
             scores=[
-                EvalScore(name=EXACT_INCLUSION, value=0.09),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=0.09),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.09),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.09),
             ],
         ),
         CategoryScore(
             name="Subsidiary",
             scores=[
-                EvalScore(name=EXACT_INCLUSION, value=0.0198),
-                EvalScore(name=QUASI_EXACT_INCLUSION, value=0.0198),
+                EvalScore(name=FACTUAL_KNOWLEDGE, value=0.0198),
+                EvalScore(name=FACTUAL_KNOWLEDGE_QUASI_EXACT, value=0.0198),
             ],
         ),
     ]
@@ -70,10 +70,10 @@
         [
             EvaluateTestCase(
                 dataset_name="trex_sample.jsonl",
-                dataset_score={EXACT_INCLUSION: 0.0547, QUASI_EXACT_INCLUSION: 0.0547},
+                dataset_score={FACTUAL_KNOWLEDGE: 0.0547, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0547},
                 category_scores={
-                    "Capitals": {EXACT_INCLUSION: 0.09, QUASI_EXACT_INCLUSION: 0.09},
-                    "Subsidiary": {EXACT_INCLUSION: 0.0198, QUASI_EXACT_INCLUSION: 0.0198},
+                    "Capitals": {FACTUAL_KNOWLEDGE: 0.09, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.09},
+                    "Subsidiary": {FACTUAL_KNOWLEDGE: 0.0198, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0198},
                 },
             ),
             # The purpose of testing evaluate() on this tiny dataset is to
@@ -83,8 +83,8 @@ class EvaluateTestCase(NamedTuple):
             # See https://github.com/ray-project/ray/pull/39960
             EvaluateTestCase(
                 dataset_name="trex_sample_small.jsonl",
-                dataset_score={EXACT_INCLUSION: 0.0, QUASI_EXACT_INCLUSION: 0.0},
-                category_scores={"Capitals": {EXACT_INCLUSION: 0.0, QUASI_EXACT_INCLUSION: 0.0}},
+                dataset_score={FACTUAL_KNOWLEDGE: 0.0, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0},
+                category_scores={"Capitals": {FACTUAL_KNOWLEDGE: 0.0, FACTUAL_KNOWLEDGE_QUASI_EXACT: 0.0}},
             ),
         ],
     )
@@ -107,20 +107,20 @@ def test_evaluate(self, integration_tests_dir, test_case):
 
         for eval_score in eval_output.dataset_scores:
             # pragma: no branch
-            if eval_score.name == EXACT_INCLUSION:
-                assert eval_score.value == approx(test_case.dataset_score[EXACT_INCLUSION], abs=ABS_TOL)
-            elif eval_score.name == QUASI_EXACT_INCLUSION:
-                assert eval_score.value == approx(test_case.dataset_score[QUASI_EXACT_INCLUSION], abs=ABS_TOL)
+            if eval_score.name == FACTUAL_KNOWLEDGE:
+                assert eval_score.value == approx(test_case.dataset_score[FACTUAL_KNOWLEDGE], abs=ABS_TOL)
+            elif eval_score.name == FACTUAL_KNOWLEDGE_QUASI_EXACT:
+                assert eval_score.value == approx(test_case.dataset_score[FACTUAL_KNOWLEDGE_QUASI_EXACT], abs=ABS_TOL)
 
         for category_score in eval_output.category_scores:  # pragma: no branch
             for eval_score in category_score.scores:
-                if eval_score.name == EXACT_INCLUSION:
+                if eval_score.name == FACTUAL_KNOWLEDGE:
                     assert eval_score.value == approx(
-                        test_case.category_scores[category_score.name][EXACT_INCLUSION], abs=ABS_TOL
+                        test_case.category_scores[category_score.name][FACTUAL_KNOWLEDGE], abs=ABS_TOL
                     )
-                elif eval_score.name == QUASI_EXACT_INCLUSION:
+                elif eval_score.name == FACTUAL_KNOWLEDGE_QUASI_EXACT:
                     assert eval_score.value == approx(
-                        test_case.category_scores[category_score.name][QUASI_EXACT_INCLUSION], abs=ABS_TOL
+                        test_case.category_scores[category_score.name][FACTUAL_KNOWLEDGE_QUASI_EXACT], abs=ABS_TOL
                     )
 
     def test_evaluate_multi_datasets(self, integration_tests_dir):
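The integration test above checks both renamed score names end to end against a live model. Here is a smaller usage sketch of the same check via evaluate_sample, which needs only the target and model output strings; the target_output_delimiter argument follows fmeval's usual pattern for passing multiple acceptable answers, and the example strings are placeholders.

```python
from fmeval.eval_algorithms.factual_knowledge import (
    FactualKnowledge,
    FactualKnowledgeConfig,
    FACTUAL_KNOWLEDGE,
    FACTUAL_KNOWLEDGE_QUASI_EXACT,
)

# Score a single target/response pair without a dataset or model runner.
eval_algo = FactualKnowledge(FactualKnowledgeConfig(target_output_delimiter="<OR>"))
eval_scores = eval_algo.evaluate_sample(
    target_output="London",
    model_output="The capital of England is London.",
)

for eval_score in eval_scores:
    # After this commit, the two scores are reported under the renamed keys.
    assert eval_score.name in (FACTUAL_KNOWLEDGE, FACTUAL_KNOWLEDGE_QUASI_EXACT)
    print(eval_score.name, eval_score.value)  # both 1.0 for this pair
```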