Merge branch 'main' into docstrings-linting
davidsbatista authored Apr 4, 2024
2 parents a1215f7 + bf8453e commit 2336f6b
Showing 14 changed files with 755 additions and 131 deletions.
21 changes: 8 additions & 13 deletions haystack/components/evaluators/answer_exact_match.py
@@ -7,17 +7,16 @@
class AnswerExactMatchEvaluator:
"""
Evaluator that checks if the predicted answers match any of the ground truth answers exactly.
The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
matched one of the ground truth answers.
Each question can have multiple ground truth answers and multiple predicted answers.
The result is a number from 0.0 to 1.0 that represents the proportion of inputs where any predicted
answer matched one of the ground truth answers.
There can be multiple ground truth answers and multiple predicted answers as input.
Usage example:
```python
from haystack.components.evaluators import AnswerExactMatchEvaluator
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
questions=["What is the capital of Germany?", "What is the capital of France?"],
ground_truth_answers=[["Berlin"], ["Paris"]],
predicted_answers=[["Berlin"], ["Lyon"]],
)
@@ -30,15 +29,11 @@ class AnswerExactMatchEvaluator:
"""

@component.output_types(individual_scores=List[int], score=float)
def run(
self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
) -> Dict[str, Any]:
def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
"""
Run the AnswerExactMatchEvaluator on the given inputs.
All lists must have the same length.
`ground_truth_answers` and `predicted_answers` must have the same length.
:param questions:
A list of questions.
:param ground_truth_answers:
A list of expected answers for each question.
:param predicted_answers:
@@ -49,8 +44,8 @@ def run(
- `score` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
answer matched one of the ground truth answers.
"""
if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
if not len(ground_truth_answers) == len(predicted_answers):
raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")

matches = []
for truths, extracted in zip(ground_truth_answers, predicted_answers):
@@ -60,6 +55,6 @@ def run(
matches.append(0)

# The proportion of questions where any predicted answer matched one of the ground truth answers
average = sum(matches) / len(questions)
average = sum(matches) / len(predicted_answers)

return {"individual_scores": matches, "score": average}
84 changes: 84 additions & 0 deletions haystack/components/evaluators/document_map.py
@@ -0,0 +1,84 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanAveragePrecision:
"""
Evaluator that calculates the mean average precision of the retrieved documents, a metric
that measures how highly the relevant retrieved documents are ranked.
Each question can have multiple ground truth documents and multiple retrieved documents.
`DocumentMeanAveragePrecision` doesn't normalize its inputs; use the `DocumentCleaner` component
to clean and normalize the documents before passing them to this evaluator.
Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentMeanAveragePrecision
evaluator = DocumentMeanAveragePrecision()
result = evaluator.run(
ground_truth_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="9th")],
],
retrieved_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
],
)
print(result["individual_scores"])
# [1.0, 0.8333333333333333]
print(result["score"])
# 0.9166666666666666
```
"""

@component.output_types(score=float, individual_scores=List[float])
def run(
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentMeanAveragePrecision on the given inputs.
All lists must have the same length.
:param ground_truth_documents:
A list of expected documents for each question.
:param retrieved_documents:
A list of retrieved documents for each question.
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how highly the relevant retrieved documents are ranked.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

individual_scores = []

for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
score = 0.0
for ground_document in ground_truth:
if ground_document.content is None:
continue

average_precision = 0.0
relevant_documents = 0

for rank, retrieved_document in enumerate(retrieved):
if retrieved_document.content is None:
continue

if ground_document.content in retrieved_document.content:
relevant_documents += 1
average_precision += relevant_documents / (rank + 1)
if relevant_documents > 0:
score = average_precision / relevant_documents
individual_scores.append(score)

score = sum(individual_scores) / len(retrieved_documents)

return {"score": score, "individual_scores": individual_scores}
79 changes: 79 additions & 0 deletions haystack/components/evaluators/document_mrr.py
@@ -0,0 +1,79 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanReciprocalRank:
"""
Evaluator that calculates the mean reciprocal rank of the retrieved documents.
MRR measures how high the first relevant retrieved document is ranked.
Each question can have multiple ground truth documents and multiple retrieved documents.
`DocumentMeanReciprocalRank` doesn't normalize its inputs; use the `DocumentCleaner` component
to clean and normalize the documents before passing them to this evaluator.
Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentMeanReciprocalRank
evaluator = DocumentMeanReciprocalRank()
result = evaluator.run(
ground_truth_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="9th")],
],
retrieved_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
],
)
print(result["individual_scores"])
# [1.0, 0.8333333333333333]
print(result["score"])
# 0.9166666666666666
```
"""

@component.output_types(score=float, individual_scores=List[float])
def run(
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentMeanReciprocalRank on the given inputs.
`ground_truth_documents` and `retrieved_documents` must have the same length.
:param ground_truth_documents:
A list of expected documents for each question.
:param retrieved_documents:
A list of retrieved documents for each question.
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high the first matching retrieved document is ranked.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

individual_scores = []

for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
score = 0.0
for ground_document in ground_truth:
if ground_document.content is None:
continue

for rank, retrieved_document in enumerate(retrieved):
if retrieved_document.content is None:
continue

if ground_document.content in retrieved_document.content:
score = 1 / (rank + 1)
break
individual_scores.append(score)

score = sum(individual_scores) / len(retrieved_documents)

return {"score": score, "individual_scores": individual_scores}
20 changes: 7 additions & 13 deletions haystack/components/evaluators/document_recall.py
@@ -31,16 +31,15 @@ def from_str(string: str) -> "RecallMode":
@component
class DocumentRecallEvaluator:
"""
Evaluator that calculates the Recall score for a list of questions.
Evaluator that calculates the Recall score for a list of documents.
Returns both a list of scores for each question and the average.
Each question can have multiple ground truth documents and multiple predicted documents.
There can be multiple ground truth documents and multiple predicted documents as input.
Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentRecallEvaluator
evaluator = DocumentRecallEvaluator()
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
)
@@ -80,17 +79,12 @@ def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_do

@component.output_types(score=float, individual_scores=List[float])
def run(
self,
questions: List[str],
ground_truth_documents: List[List[Document]],
retrieved_documents: List[List[Document]],
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentRecallEvaluator on the given inputs.
All lists must have the same length.
`ground_truth_documents` and `retrieved_documents` must have the same length.
:param questions:
A list of questions.
:param ground_truth_documents:
A list of expected documents for each question.
:param retrieved_documents:
@@ -100,13 +94,13 @@ def run(
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
If the mode is `single_hit`, the individual scores are True or False.
"""
if not len(questions) == len(ground_truth_documents) == len(retrieved_documents):
msg = "The length of questions, ground_truth_documents, and predicted_documents must be the same."
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

scores = []
for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
score = self.mode_function(ground_truth, retrieved)
scores.append(score)

return {"score": sum(scores) / len(questions), "individual_scores": scores}
return {"score": sum(scores) / len(retrieved_documents), "individual_scores": scores}