Merge branch 'main' into docstrings-linting
davidsbatista authored Apr 4, 2024
2 parents a1215f7 + bf8453e commit 2336f6b
Showing 14 changed files with 755 additions and 131 deletions.
21 changes: 8 additions & 13 deletions haystack/components/evaluators/answer_exact_match.py
@@ -7,17 +7,16 @@
class AnswerExactMatchEvaluator:
"""
Evaluator that checks if the predicted answers match any of the ground truth answers exactly.
The result is a number from 0.0 to 1.0, it represents the proportion of questions where any predicted answer
matched one of the ground truth answers.
Each question can have multiple ground truth answers and multiple predicted answers.
The result is a number from 0.0 to 1.0 that represents the proportion of inputs where any predicted
answer matched one of the ground truth answers.
There can be multiple ground truth answers and multiple predicted answers as input.
Usage example:
```python
from haystack.components.evaluators import AnswerExactMatchEvaluator
evaluator = AnswerExactMatchEvaluator()
result = evaluator.run(
questions=["What is the capital of Germany?", "What is the capital of France?"],
ground_truth_answers=[["Berlin"], ["Paris"]],
predicted_answers=[["Berlin"], ["Lyon"]],
)
@@ -30,15 +29,11 @@ class AnswerExactMatchEvaluator:
"""

@component.output_types(individual_scores=List[int], score=float)
def run(
self, questions: List[str], ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]
) -> Dict[str, Any]:
def run(self, ground_truth_answers: List[List[str]], predicted_answers: List[List[str]]) -> Dict[str, Any]:
"""
Run the AnswerExactMatchEvaluator on the given inputs.
All lists must have the same length.
`ground_truth_answers` and `predicted_answers` must have the same length.
:param questions:
A list of questions.
:param ground_truth_answers:
A list of expected answers for each question.
:param predicted_answers:
@@ -49,8 +44,8 @@ def run(
- `score` - A number from 0.0 to 1.0 that represents the proportion of questions where any predicted
answer matched one of the ground truth answers.
"""
if not len(questions) == len(ground_truth_answers) == len(predicted_answers):
raise ValueError("The length of questions, ground_truth_answers, and predicted_answers must be the same.")
if not len(ground_truth_answers) == len(predicted_answers):
raise ValueError("The length of ground_truth_answers and predicted_answers must be the same.")

matches = []
for truths, extracted in zip(ground_truth_answers, predicted_answers):
@@ -60,6 +55,6 @@ def run(
matches.append(0)

# The proportion of questions where any predicted answer matched one of the ground truth answers
average = sum(matches) / len(questions)
average = sum(matches) / len(predicted_answers)

return {"individual_scores": matches, "score": average}
84 changes: 84 additions & 0 deletions haystack/components/evaluators/document_map.py
@@ -0,0 +1,84 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanAveragePrecision:
"""
Evaluator that calculates the mean average precision of the retrieved documents, a metric
that measures how highly the relevant retrieved documents are ranked.
Each question can have multiple ground truth documents and multiple retrieved documents.
`DocumentMeanAveragePrecision` doesn't normalize its inputs; use the `DocumentCleaner` component
to clean and normalize the documents before passing them to this evaluator.
Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentMeanAveragePrecision
evaluator = DocumentMeanAveragePrecision()
result = evaluator.run(
ground_truth_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="9th")],
],
retrieved_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
],
)
print(result["individual_scores"])
# [1.0, 0.8333333333333333]
print(result["score"])
# 0.9166666666666666
```
"""

@component.output_types(score=float, individual_scores=List[float])
def run(
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentMeanAveragePrecision on the given inputs.
All lists must have the same length.
:param ground_truth_documents:
A list of expected documents for each question.
:param retrieved_documents:
A list of retrieved documents for each question.
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how highly the relevant retrieved documents are ranked.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

individual_scores = []

for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
score = 0.0
for ground_document in ground_truth:
if ground_document.content is None:
continue

average_precision = 0.0
relevant_documents = 0

for rank, retrieved_document in enumerate(retrieved):
if retrieved_document.content is None:
continue

if ground_document.content in retrieved_document.content:
relevant_documents += 1
average_precision += relevant_documents / (rank + 1)
if relevant_documents > 0:
score = average_precision / relevant_documents
individual_scores.append(score)

score = sum(individual_scores) / len(retrieved_documents)

return {"score": score, "individual_scores": individual_scores}
79 changes: 79 additions & 0 deletions haystack/components/evaluators/document_mrr.py
@@ -0,0 +1,79 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanReciprocalRank:
"""
Evaluator that calculates the mean reciprocal rank of the retrieved documents.
MRR measures how high the first relevant retrieved document is ranked.
Each question can have multiple ground truth documents and multiple retrieved documents.
`DocumentMeanReciprocalRank` doesn't normalize its inputs; use the `DocumentCleaner` component
to clean and normalize the documents before passing them to this evaluator.
Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentMeanReciprocalRank
evaluator = DocumentMeanReciprocalRank()
result = evaluator.run(
ground_truth_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="9th")],
],
retrieved_documents=[
[Document(content="France")],
[Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
],
)
print(result["individual_scores"])
# [1.0, 0.8333333333333333]
print(result["score"])
# 0.9166666666666666
```
"""

@component.output_types(score=float, individual_scores=List[float])
def run(
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentMeanReciprocalRank on the given inputs.
`ground_truth_documents` and `retrieved_documents` must have the same length.
:param ground_truth_documents:
A list of expected documents for each question.
:param retrieved_documents:
A list of retrieved documents for each question.
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents how high the first matching retrieved document is ranked.
"""
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

individual_scores = []

for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
score = 0.0
for ground_document in ground_truth:
if ground_document.content is None:
continue

for rank, retrieved_document in enumerate(retrieved):
if retrieved_document.content is None:
continue

if ground_document.content in retrieved_document.content:
score = 1 / (rank + 1)
break
individual_scores.append(score)

score = sum(individual_scores) / len(retrieved_documents)

return {"score": score, "individual_scores": individual_scores}
20 changes: 7 additions & 13 deletions haystack/components/evaluators/document_recall.py
@@ -31,16 +31,15 @@ def from_str(string: str) -> "RecallMode":
@component
class DocumentRecallEvaluator:
"""
Evaluator that calculates the Recall score for a list of questions.
Evaluator that calculates the Recall score for a list of documents.
Returns both a list of scores for each question and the average.
Each question can have multiple ground truth documents and multiple predicted documents.
There can be multiple ground truth documents and multiple predicted documents as input.
Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentRecallEvaluator
evaluator = DocumentRecallEvaluator()
result = evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
)
@@ -80,17 +79,12 @@ def _recall_multi_hit(self, ground_truth_documents: List[Document], retrieved_do

@component.output_types(score=float, individual_scores=List[float])
def run(
self,
questions: List[str],
ground_truth_documents: List[List[Document]],
retrieved_documents: List[List[Document]],
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentRecallEvaluator on the given inputs.
All lists must have the same length.
`ground_truth_documents` and `retrieved_documents` must have the same length.
:param questions:
A list of questions.
:param ground_truth_documents:
A list of expected documents for each question.
:param retrieved_documents:
@@ -100,13 +94,13 @@ def run(
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the proportion of matching documents retrieved.
If the mode is `single_hit`, the individual scores are True or False.
"""
if not len(questions) == len(ground_truth_documents) == len(retrieved_documents):
msg = "The length of questions, ground_truth_documents, and predicted_documents must be the same."
if len(ground_truth_documents) != len(retrieved_documents):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

scores = []
for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
score = self.mode_function(ground_truth, retrieved)
scores.append(score)

return {"score": sum(scores) / len(questions), "individual_scores": scores}
return {"score": sum(scores) / len(retrieved_documents), "individual_scores": scores}