Commit
Merge branch 'main' into docstrings-linting
davidsbatista committed Apr 23, 2024
2 parents 7997915 + d7638cf commit f7ae004
Showing 23 changed files with 477 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/project.yml
@@ -10,7 +10,7 @@ jobs:
    name: Add new issues to project for triage
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected].0
      - uses: actions/[email protected].1
        with:
          project-url: https://github.com/orgs/deepset-ai/projects/5
          github-token: ${{ secrets.GH_PROJECT_PAT }}
39 changes: 39 additions & 0 deletions docs/pydoc/config/evaluators_api.yml
@@ -0,0 +1,39 @@
loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path: [../../../haystack/components/evaluators]
    modules:
      [
        "answer_exact_match",
        "context_relevance",
        "document_map",
        "document_mrr",
        "document_recall",
        "evaluation_result",
        "document_recall",
        "faithfulness",
        "llm_evaluator",
        "sas_evaluator",
      ]
    ignore_when_discovered: ["__init__"]
processors:
  - type: filter
    expression:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
  - type: smart
  - type: crossref
renderer:
  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
  excerpt: Evaluate your pipelines or individual components.
  category_slug: haystack-api
  title: Evaluators
  slug: evaluators-api
  order: 5
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
    descriptive_module_title: true
    add_method_class_prefix: true
    add_member_class_prefix: false
  filename: evaluators_api.md
2 changes: 1 addition & 1 deletion haystack/components/caching/cache_checker.py
@@ -76,7 +76,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "CacheChecker":

        try:
            module_name, type_ = init_params["document_store"]["type"].rsplit(".", 1)
            logger.debug("Trying to import module '{module}'", module=module_name)
            logger.debug("Trying to import module '{module_name}'", module_name=module_name)
            module = importlib.import_module(module_name)
        except (ImportError, DeserializationError) as e:
            raise DeserializationError(
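The placeholder rename above (repeated in filter_retriever.py, document_writer.py, and component.py below) makes the name inside `{...}` match the keyword argument passed to `logger.debug`. A minimal sketch of why the two must agree, assuming the logger resolves placeholders from its keyword arguments the way `str.format` does:

```python
# Minimal sketch (assumption: the logger substitutes "{name}" placeholders from its
# keyword arguments, in the same way str.format does).
old_template = "Trying to import module '{module}'"
new_template = "Trying to import module '{module_name}'"

try:
    old_template.format(module_name="haystack.document_stores.in_memory")
except KeyError as err:
    print(f"unresolved placeholder: {err}")  # the mismatched name raises KeyError: 'module'

# With matching names the message renders as intended.
print(new_template.format(module_name="haystack.document_stores.in_memory"))
```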
2 changes: 2 additions & 0 deletions haystack/components/evaluators/__init__.py
@@ -1,4 +1,5 @@
from .answer_exact_match import AnswerExactMatchEvaluator
from .context_relevance import ContextRelevanceEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_recall import DocumentRecallEvaluator
@@ -9,6 +10,7 @@

__all__ = [
    "AnswerExactMatchEvaluator",
    "ContextRelevanceEvaluator",
    "DocumentMAPEvaluator",
    "DocumentMRREvaluator",
    "DocumentRecallEvaluator",
154 changes: 154 additions & 0 deletions haystack/components/evaluators/context_relevance.py
@@ -0,0 +1,154 @@
from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
    {
        "inputs": {
            "questions": "What is the capital of Germany?",
            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
        },
        "outputs": {
            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
            "statement_scores": [1, 0],
        },
    },
    {
        "inputs": {"questions": "What is the capital of France?", "contexts": ["Berlin is the capital of Germany."]},
        "outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
    },
    {
        "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
        "outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
    },
]


class ContextRelevanceEvaluator(LLMEvaluator):
    """
    Evaluator that checks if a provided context is relevant to the question.
    An LLM breaks the context up into multiple statements and checks whether each statement is relevant for
    answering the question. The final score for a question's contexts is a number from 0.0 to 1.0, representing
    the proportion of statements that are relevant to the question.
    Usage example:
    ```python
    from haystack.components.evaluators import ContextRelevanceEvaluator
    questions = ["Who created the Python language?"]
    contexts = [
        [
            "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
        ],
    ]
    evaluator = ContextRelevanceEvaluator()
    result = evaluator.run(questions=questions, contexts=contexts)
    print(result["score"])
    # 1.0
    print(result["individual_scores"])
    # [1.0]
    print(result["results"])
    # [{'statements': ['Python, created by Guido van Rossum in the late 1980s.'], 'statement_scores': [1], 'score': 1.0}]
    ```
    """

    def __init__(
        self,
        examples: Optional[List[Dict[str, Any]]] = None,
        api: str = "openai",
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
    ):
        """
        Creates an instance of ContextRelevanceEvaluator.
        :param examples:
            Optional few-shot examples conforming to the expected input and output format of ContextRelevanceEvaluator.
            Default examples will be used if none are provided.
            Each example must be a dictionary with keys "inputs" and "outputs".
            "inputs" must be a dictionary with keys "questions" and "contexts".
            "outputs" must be a dictionary with "statements" and "statement_scores".
            Expected format:
            [{
                "inputs": {
                    "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy."],
                    "statement_scores": [1],
                },
            }]
        :param api:
            The API to use for calling an LLM through a Generator.
            Supported APIs: "openai".
        :param api_key:
            The API key.
        """
        self.instructions = (
            "Your task is to judge how relevant the provided context is for answering a question. "
            "First, please extract statements from the provided context. "
            "Second, calculate a relevance score for each statement in the context. "
            "The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
        )
        self.inputs = [("questions", List[str]), ("contexts", List[List[str]])]
        self.outputs = ["statements", "statement_scores"]
        self.examples = examples or _DEFAULT_EXAMPLES
        self.api = api
        self.api_key = api_key

        super().__init__(
            instructions=self.instructions,
            inputs=self.inputs,
            outputs=self.outputs,
            examples=self.examples,
            api=self.api,
            api_key=self.api_key,
        )

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
        """
        Run the LLM evaluator.
        :param questions:
            A list of questions.
        :param contexts:
            A list of lists of contexts. Each list of contexts corresponds to one question.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean context relevance score over all the provided input questions.
                - `individual_scores`: A list of context relevance scores for each input question.
                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
        """
        result = super().run(questions=questions, contexts=contexts)

        # calculate average statement relevance score per query
        for res in result["results"]:
            res["score"] = np_mean(res["statement_scores"])

        # calculate average context relevance score over all queries
        result["score"] = np_mean([res["score"] for res in result["results"]])
        result["individual_scores"] = [res["score"] for res in result["results"]]

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
        """
        Deserialize this component from a dictionary.
        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
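For reference, a hedged sketch of constructing the new evaluator with custom few-shot `examples` in the format documented in `__init__` (the questions, contexts, and scores below are illustrative only, and a valid `OPENAI_API_KEY` must be set in the environment for the underlying LLM call):

```python
from haystack.components.evaluators import ContextRelevanceEvaluator

# Illustrative few-shot examples following the documented "inputs"/"outputs" format.
custom_examples = [
    {
        "inputs": {
            "questions": "Who wrote 'Dubliners'?",
            "contexts": ["James Joyce wrote 'Dubliners'. Joyce was born in Dublin in 1882."],
        },
        "outputs": {
            "statements": ["James Joyce wrote 'Dubliners'.", "Joyce was born in Dublin in 1882."],
            "statement_scores": [1, 0],
        },
    },
]

evaluator = ContextRelevanceEvaluator(examples=custom_examples)
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[["Python was created by Guido van Rossum in the late 1980s."]],
)
print(result["score"], result["individual_scores"], result["results"])
```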
82 changes: 45 additions & 37 deletions haystack/components/evaluators/faithfulness.py
@@ -7,6 +7,40 @@
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
    {
        "inputs": {
            "questions": "What is the capital of Germany and when was it founded?",
            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
        },
        "outputs": {
            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
            "statement_scores": [1, 1],
        },
    },
    {
        "inputs": {
            "questions": "What is the capital of France?",
            "contexts": ["Berlin is the capital of Germany."],
            "responses": "Paris",
        },
        "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
    },
    {
        "inputs": {
            "questions": "What is the capital of Italy?",
            "contexts": ["Rome is the capital of Italy."],
            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
        },
        "outputs": {
            "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
            "statement_scores": [1, 0],
        },
    },
]


class FaithfulnessEvaluator(LLMEvaluator):
    """
@@ -50,7 +84,8 @@ def __init__(
        Creates an instance of FaithfulnessEvaluator.
        :param examples:
            Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
            Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
            Default examples will be used if none are provided.
            Each example must be a dictionary with keys "inputs" and "outputs".
            "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
            "outputs" must be a dictionary with "statements" and "statement_scores".
@@ -81,38 +116,7 @@ def __init__(
        )
        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
        self.outputs = ["statements", "statement_scores"]
        self.examples = examples or [
            {
                "inputs": {
                    "questions": "What is the capital of Germany and when was it founded?",
                    "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
                },
                "outputs": {
                    "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
                    "statement_scores": [1, 1],
                },
            },
            {
                "inputs": {
                    "questions": "What is the capital of France?",
                    "contexts": ["Berlin is the capital of Germany."],
                    "responses": "Paris",
                },
                "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
            },
            {
                "inputs": {
                    "questions": "What is the capital of Italy?",
                    "contexts": ["Rome is the capital of Italy."],
                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
                    "statement_scores": [1, 0],
                },
            },
        ]
        self.examples = examples or _DEFAULT_EXAMPLES
        self.api = api
        self.api_key = api_key

@@ -126,19 +130,23 @@ def __init__(
        )

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, **inputs) -> Dict[str, Any]:
    def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
        """
        Run the LLM evaluator.
        :param inputs:
            The input values to evaluate. The keys are the input names and the values are lists of input values.
        :param questions:
            A list of questions.
        :param contexts:
            A nested list of contexts that correspond to the questions.
        :param responses:
            A list of responses.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean faithfulness score over all the provided input answers.
                - `individual_scores`: A list of faithfulness scores for each input answer.
                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
        """
        result = super().run(**inputs)
        result = super().run(questions=questions, contexts=contexts, responses=responses)

        # calculate average statement faithfulness score per query
        for res in result["results"]:
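Because `run` now takes `questions`, `contexts`, and `responses` explicitly instead of `**inputs`, a hedged usage sketch against the new signature (the values are illustrative and a valid `OPENAI_API_KEY` is required for a real call):

```python
from haystack.components.evaluators import FaithfulnessEvaluator

questions = ["Who created the Python language?"]
contexts = [["Python was created by Guido van Rossum in the late 1980s."]]
responses = ["Guido van Rossum created Python."]

evaluator = FaithfulnessEvaluator()  # falls back to the module-level _DEFAULT_EXAMPLES
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)

# Outputs documented in the run() docstring:
print(result["score"])              # mean faithfulness score over all answers
print(result["individual_scores"])  # one score per answer
print(result["results"])            # statements and statement_scores per answer
```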
2 changes: 1 addition & 1 deletion haystack/components/retrievers/filter_retriever.py
@@ -79,7 +79,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "FilterRetriever":
            raise DeserializationError("Missing 'type' in document store's serialization data")
        try:
            module_name, type_ = init_params["document_store"]["type"].rsplit(".", 1)
            logger.debug("Trying to import module '{module}'", module=module_name)
            logger.debug("Trying to import module '{module_name}'", module_name=module_name)
            module = importlib.import_module(module_name)
        except (ImportError, DeserializationError) as e:
            raise DeserializationError(
2 changes: 1 addition & 1 deletion haystack/components/writers/document_writer.py
@@ -76,7 +76,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentWriter":

        try:
            module_name, type_ = init_params["document_store"]["type"].rsplit(".", 1)
            logger.debug("Trying to import module '{module}'", module=module_name)
            logger.debug("Trying to import module '{module_name}'", module_name=module_name)
            module = importlib.import_module(module_name)
        except (ImportError, DeserializationError) as e:
            raise DeserializationError(
6 changes: 3 additions & 3 deletions haystack/core/component/component.py
@@ -411,10 +411,10 @@ def copy_class_namespace(namespace):
        if class_path in self.registry:
            # Corner case, but it may occur easily in notebooks when re-running cells.
            logger.debug(
                "Component {component} is already registered. Previous imported from '{module}', new imported from '{new_module}'",
                "Component {component} is already registered. Previous imported from '{module_name}', new imported from '{new_module_name}'",
                component=class_path,
                module=self.registry[class_path],
                new_module=cls,
                module_name=self.registry[class_path],
                new_module_name=cls,
            )
        self.registry[class_path] = cls
        logger.debug("Registered Component {component}", component=cls)
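The duplicate-registration branch mostly covers notebooks: re-running a cell that defines a component registers the same class path again, and only the debug message above is emitted. A rough sketch of that corner case, using a hypothetical `Echo` component (not part of Haystack):

```python
from typing import Dict

from haystack.core.component import component


@component
class Echo:
    @component.output_types(text=str)
    def run(self, text: str) -> Dict[str, str]:
        return {"text": text}


# Re-declaring the class (as happens when a notebook cell is re-run) re-registers the
# same class path; the registry logs the debug message and keeps the newest class.
@component
class Echo:  # noqa: F811 - intentional redefinition for illustration
    @component.output_types(text=str)
    def run(self, text: str) -> Dict[str, str]:
        return {"text": text}
```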