From 60eab3c13bf4697beae6eb4512bcd88238ba41bb Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 26 Feb 2024 14:51:54 +0100 Subject: [PATCH 01/17] draft llm evaluator --- haystack/components/evaluators/__init__.py | 3 +- haystack/components/evaluators/example.py | 36 ++++++ .../components/evaluators/llm_evaluator.py | 106 ++++++++++++++++++ 3 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 haystack/components/evaluators/example.py create mode 100644 haystack/components/evaluators/llm_evaluator.py diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py index 9550a5f42d..88e78e5756 100644 --- a/haystack/components/evaluators/__init__.py +++ b/haystack/components/evaluators/__init__.py @@ -1,3 +1,4 @@ from .answer_exact_match import AnswerExactMatchEvaluator +from .llm_evaluator import LLMEvaluator -__all__ = ["AnswerExactMatchEvaluator"] +__all__ = ["AnswerExactMatchEvaluator", "LLMEvaluator"] diff --git a/haystack/components/evaluators/example.py b/haystack/components/evaluators/example.py new file mode 100644 index 0000000000..6803670642 --- /dev/null +++ b/haystack/components/evaluators/example.py @@ -0,0 +1,36 @@ +# A valid OpenAI API key must be provided as an environment variable "OPENAI_API_KEY" to run this example. +from typing import List + +from haystack import Pipeline + +from haystack.components.evaluators import LLMEvaluator + +QUESTIONS = ["Which is the most popular global sport?", "Who created the Python language?"] +CONTEXTS = [ + [ + "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people." + ], + [ + "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects." + ], +] +RESPONSES = [ + "Football is the most popular sport with around 4 billion followers worldwide", + "Python language was created by Guido van Rossum.", +] + +GROUND_TRUTHS = ["Football is the most popular sport", "Python language was created by Guido van Rossum."] + +pipeline = Pipeline() +evaluator = LLMEvaluator( + api="openai", + inputs=[("responses", List[str])], + outputs=["score"], + instruction="Is this answer problematic for children?", +) +pipeline.add_component("evaluator", evaluator) + +results = pipeline.run({"evaluator": {"responses": RESPONSES}}) + +for output in results["evaluator"]["results"]: + print(output) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py new file mode 100644 index 0000000000..8bada84101 --- /dev/null +++ b/haystack/components/evaluators/llm_evaluator.py @@ -0,0 +1,106 @@ +from typing import Any, Dict, List, Tuple, Type + +from haystack import component, default_from_dict, default_to_dict +from haystack.components.builders import PromptBuilder +from haystack.components.generators import OpenAIGenerator +from haystack.utils import Secret + + +@component +class LLMEvaluator: + """ + A component that uses an LLM to evaluate inputs against a specific metric. 
+ + Most of them require an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY". + The inputs of the component are metric-dependent. + The output is a nested list of evaluation results where each inner list contains the results for a single input. + """ + + def __init__( + self, + instruction: str, + inputs: List[Tuple[str, Type]], + outputs: List[str], + *, + api: str = "openai", + api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), + ): + """ + Construct a new LLM evaluator. + + :param api: + The API to use for evaluation. + + Supported APIs: "openai". + :param api_key: + The API key to use. + """ + + self.instruction = instruction + self.inputs = inputs + self.outputs = outputs + self.api = api + self.api_key = api_key + expected_inputs = dict(inputs) + if api == "openai": + self.generator = OpenAIGenerator(api_key=api_key) + + component.set_input_types(self, **expected_inputs) + + @component.output_types(results=List[List[Dict[str, Any]]]) + def run(self, **inputs) -> Dict[str, Any]: + """ + Run the LLM evaluator. + + Example: + ```python + p = Pipeline() + evaluator = LLMEvaluator( + api = "openai", + ) + p.add_component("evaluator", evaluator) + + results = p.run({"evaluator": {"questions": QUESTIONS, "contexts": CONTEXTS, "ground_truths": GROUND_TRUTHS}}) + ``` + + :param inputs: + The inputs to evaluate. These are determined by the + metric being calculated. See :class:`RagasMetric` for more + information. + :returns: + A nested list of metric results. Each input can have one or more + results, depending on the metric. Each result is a dictionary + containing the following keys and values: + * `name` - The name of the metric. + * `score` - The score of the metric. + """ + # TODO: validate input parameters + # InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs) + + results = [] + for input_socket in self.inputs: + self.instruction += f"{input_socket[0]}: {{{{ input_socket[1] }}}}" # fix: do not hardcode + builder = PromptBuilder(template=self.instruction) + for response in inputs["responses"]: # fix: do not hardcode + prompt = builder.run(response=response) + result = self.generator.run(prompt=prompt["prompt"]) + results.append(result["replies"]) + # todo: convert result list + return {"results": [{"name": "llm", "score": 1.0}]} + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + + return default_to_dict(self, instruction=self.instruction, api=self.api, api_key=self.api_key.to_dict()) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator": + """ + Deserialize a component from a dictionary. + + :param data: + The dictionary to deserialize from. 
+ """ + return default_from_dict(cls, data) From 795806befce848cd7916e757d6190651b0695ec0 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Wed, 20 Mar 2024 11:51:08 +0100 Subject: [PATCH 02/17] docstrings --- .../components/evaluators/llm_evaluator.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 8bada84101..d0761f25f5 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,7 @@ from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder from haystack.components.generators import OpenAIGenerator -from haystack.utils import Secret +from haystack.utils import Secret, deserialize_secrets_inplace @component @@ -26,7 +26,7 @@ def __init__( api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), ): """ - Construct a new LLM evaluator. + Creates an instance of LLMEvaluator. :param api: The API to use for evaluation. @@ -88,19 +88,30 @@ def run(self, **inputs) -> Dict[str, Any]: # todo: convert result list return {"results": [{"name": "llm", "score": 1.0}]} + def _get_telemetry_data(self) -> Dict[str, Any]: + """ + Data that is sent to Posthog for usage analytics. + """ + return {"api": self.api} + def to_dict(self) -> Dict[str, Any]: """ Serialize this component to a dictionary. - """ + :returns: + The serialized component as a dictionary. + """ return default_to_dict(self, instruction=self.instruction, api=self.api, api_key=self.api_key.to_dict()) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator": """ - Deserialize a component from a dictionary. + Deserialize this component from a dictionary. :param data: - The dictionary to deserialize from. + The dictionary representation of this component. + :returns: + The deserialized component instance. 
""" + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) From 521c80aef077eb4a7282bdd903148a383c8df053 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 21 Mar 2024 17:53:03 +0100 Subject: [PATCH 03/17] flexible inputs; validate inputs and outputs --- haystack/components/evaluators/example.py | 7 +- .../components/evaluators/llm_evaluator.py | 161 ++++++++++++++---- 2 files changed, 137 insertions(+), 31 deletions(-) diff --git a/haystack/components/evaluators/example.py b/haystack/components/evaluators/example.py index 6803670642..4e9cffbd24 100644 --- a/haystack/components/evaluators/example.py +++ b/haystack/components/evaluators/example.py @@ -2,7 +2,6 @@ from typing import List from haystack import Pipeline - from haystack.components.evaluators import LLMEvaluator QUESTIONS = ["Which is the most popular global sport?", "Who created the Python language?"] @@ -26,7 +25,11 @@ api="openai", inputs=[("responses", List[str])], outputs=["score"], - instruction="Is this answer problematic for children?", + instructions="Is this answer problematic for children?", + examples=[ + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + ], ) pipeline.add_component("evaluator", evaluator) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index d0761f25f5..4cbfdf7b53 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, List, Tuple, Type +import json +from typing import Any, Dict, List, Optional, Tuple, Type from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder @@ -9,41 +10,68 @@ @component class LLMEvaluator: """ - A component that uses an LLM to evaluate inputs against a specific metric. + Uses an LLM to evaluate inputs based on provided instructions and examples. - Most of them require an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY". + The default api requires an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY". The inputs of the component are metric-dependent. - The output is a nested list of evaluation results where each inner list contains the results for a single input. + The output is a dictionary with a key `results` containing a list of evaluation results. + + Usage example: + ```python + from haystack.components.evaluators import LLMEvaluator + evaluator = LLMEvaluator( + instructions="Is this answer problematic for children?", + inputs=[("responses", List[str])], + outputs=["score"], + ) + RESPONSES = [ + "Football is the most popular sport with around 4 billion followers worldwide", + "Python language was created by Guido van Rossum.", + ] + results = evaluator.run(responses=RESPONSES) + ``` """ def __init__( self, - instruction: str, + instructions: str, inputs: List[Tuple[str, Type]], outputs: List[str], *, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), + examples: Optional[List[Dict[str, Any]]] = None, ): """ Creates an instance of LLMEvaluator. + :param instructions: + The prompt instructions to use for evaluation. + :param inputs: + The inputs to use for evaluation. Each input is a tuple containing + the name of the input and the type of the input. 
+ :param outputs: + The output names of the evaluation results. :param api: The API to use for evaluation. - Supported APIs: "openai". :param api_key: The API key to use. + :param examples: + Few-shot examples conforming to the input and output format. """ - self.instruction = instruction + self.instructions = instructions self.inputs = inputs self.outputs = outputs self.api = api self.api_key = api_key + self.examples = examples expected_inputs = dict(inputs) if api == "openai": self.generator = OpenAIGenerator(api_key=api_key) + else: + raise ValueError(f"Unsupported API: {api}") component.set_input_types(self, **expected_inputs) @@ -52,17 +80,6 @@ def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. - Example: - ```python - p = Pipeline() - evaluator = LLMEvaluator( - api = "openai", - ) - p.add_component("evaluator", evaluator) - - results = p.run({"evaluator": {"questions": QUESTIONS, "contexts": CONTEXTS, "ground_truths": GROUND_TRUTHS}}) - ``` - :param inputs: The inputs to evaluate. These are determined by the metric being calculated. See :class:`RagasMetric` for more @@ -74,19 +91,46 @@ def run(self, **inputs) -> Dict[str, Any]: * `name` - The name of the metric. * `score` - The score of the metric. """ - # TODO: validate input parameters - # InputConverters.validate_input_parameters(self.metric, self.descriptor.input_parameters, inputs) + self.validate_input_parameters(dict(self.inputs), inputs) + self.validate_lengths(*inputs.values()) results = [] - for input_socket in self.inputs: - self.instruction += f"{input_socket[0]}: {{{{ input_socket[1] }}}}" # fix: do not hardcode - builder = PromptBuilder(template=self.instruction) - for response in inputs["responses"]: # fix: do not hardcode - prompt = builder.run(response=response) + template = self.prepare_template() + builder = PromptBuilder(template=template) + + # inputs is a dictionary with keys being input names and values being a list of input values + # We need to iterate through the lists in parallel for all keys of the dictionary + input_names, values = inputs.keys(), list(zip(*inputs.values())) + list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] + + for input_names_to_values in list_of_input_names_to_values: + prompt = builder.run(**input_names_to_values) + # TODO rendered prompt should contain " instead of ' for filled in values such as responses. + # and for strings it should contain " instead of currently no delimiters + # json.dumps() instead of str() should be used result = self.generator.run(prompt=prompt["prompt"]) - results.append(result["replies"]) - # todo: convert result list - return {"results": [{"name": "llm", "score": 1.0}]} + + self.validate_outputs(expected=self.outputs, received=result["replies"][0]) + parsed_result = json.loads(result["replies"][0]) + parsed_result["name"] = "llm" + results.append(parsed_result) + + return {"results": results} + + def prepare_template(self) -> str: + """ + Combine instructions, inputs, outputs, and examples into one prompt template. 
+ """ + inputs_section = ( + "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}" + ) + examples_section = "" + if self.examples: + for example in self.examples: + examples_section += ( + "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) + "\n" + ) + return f"Respond only in JSON format with a key {json.dumps(self.outputs)} and a value of either 0 for FALSE or 1 for TRUE.\n{self.instructions}\n{examples_section}Inputs:\n{inputs_section}\nOutputs:\n" def _get_telemetry_data(self) -> Dict[str, Any]: """ @@ -101,7 +145,15 @@ def to_dict(self) -> Dict[str, Any]: :returns: The serialized component as a dictionary. """ - return default_to_dict(self, instruction=self.instruction, api=self.api, api_key=self.api_key.to_dict()) + return default_to_dict( + self, + instructions=self.instructions, + inputs=self.inputs, + outputs=self.outputs, + api=self.api, + api_key=self.api_key.to_dict(), + examples=self.examples, + ) @classmethod def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator": @@ -115,3 +167,54 @@ def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator": """ deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) + + @staticmethod + def validate_lengths(*lists): + """ + Validate that all input lists have the same length. + + :param lists: + The lists to validate. + """ + length = len(lists[0]) + if all(len(lst) == length for lst in lists[1:]): + return True + else: + msg = f"LLM evaluator expects all input lists to have the same length but received {lists} with lengths {[len(lst) for lst in lists]}." + raise ValueError(msg) + + @staticmethod + def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None: + """ + Validate the input parameters. + + :param expected: + The expected input parameters. + :param received: + The received input parameters. + + :raises ValueError: + If not all expected inputs are present in the received inputs + """ + for param in expected.keys(): + if param not in received: + msg = f"LLM evaluator expected input parameter '{param}' but received only {received.keys()}." + raise ValueError(msg) + + @staticmethod + def validate_outputs(expected: List[str], received: str) -> None: + """ + Validate the output. + + :param expected: + Names of expected outputs + :param received: + Names of received outputs + + :raises ValueError: + If not all expected outputs are present in the received outputs + """ + parsed_output = json.loads(received) + if not all(output in parsed_output for output in expected): + msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." 
+ raise ValueError(msg) From 3bf1ab870cd53e967e2bbb58ca57c94b4e698f6a Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 21 Mar 2024 17:54:01 +0100 Subject: [PATCH 04/17] add tests --- .../evaluators/test_llm_evaluator.py | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 test/components/evaluators/test_llm_evaluator.py diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py new file mode 100644 index 0000000000..78527b1075 --- /dev/null +++ b/test/components/evaluators/test_llm_evaluator.py @@ -0,0 +1,169 @@ +import os +from typing import List + +import pytest + +from haystack.components.evaluators import LLMEvaluator +from haystack.utils.auth import Secret + + +class TestLLMEvaluator: + def test_init_default(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + assert component.api == "openai" + assert component.generator.client.api_key == "test-api-key" + assert component.instructions == "test-instruction" + assert component.inputs == [("responses", List[str])] + assert component.outputs == ["score"] + assert component.examples == None + + def test_init_fail_wo_openai_api_key(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + with pytest.raises(ValueError, match="None of the .* environment variables are set"): + LLMEvaluator( + api="openai", instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"] + ) + + def test_init_with_parameters(self): + component = LLMEvaluator( + instructions="test-instruction", + api_key=Secret.from_token("test-api-key"), + inputs=[("responses", List[str])], + outputs=["score"], + api="openai", + examples=[ + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + ], + ) + assert component.generator.client.api_key == "test-api-key" + assert component.api == "openai" + assert component.examples == [ + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + ] + assert component.instructions == "test-instruction" + assert component.inputs == [("responses", List[str])] + assert component.outputs == ["score"] + + def test_to_dict_default(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + data = component.to_dict() + assert data == { + "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", + "init_parameters": { + "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"}, + "api": "openai", + "instructions": "test-instruction", + "inputs": [("responses", List[str])], + "outputs": ["score"], + "examples": None, + }, + } + + def test_to_dict_with_parameters(self, monkeypatch): + monkeypatch.setenv("ENV_VAR", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + api_key=Secret.from_env_var("ENV_VAR"), + inputs=[("responses", List[str])], + outputs=["score"], + api="openai", + examples=[ + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, + {"inputs": {"responses": "Football is the most popular 
sport."}, "outputs": {"score": 0}}, + ], + ) + data = component.to_dict() + assert data == { + "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", + "init_parameters": { + "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"}, + "api": "openai", + "instructions": "test-instruction", + "inputs": [("responses", List[str])], + "outputs": ["score"], + "examples": [ + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + ], + }, + } + + def test_run_with_different_lengths(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("questions", List[str]), ("responses", List[List[str]])], + outputs=["score"], + ) + + def generator_run(self, *args, **kwargs): + return {"replies": [{"score": 0.5}]} + + monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) + + with pytest.raises(ValueError): + component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]]) + + with pytest.raises(ValueError): + component.run( + questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]] + ) + + def test_prepare_template_wo_examples(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + template = component.prepare_template() + assert ( + template + == 'Respond only in JSON format with a key ["score"] and a value of either 0 for FALSE or 1 for TRUE.\ntest-instruction\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' + ) + + def test_prepare_template_with_examples(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[ + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + ], + ) + template = component.prepare_template() + assert ( + template + == 'Respond only in JSON format with a key ["score"] and a value of either 0 for FALSE or 1 for TRUE.\ntest-instruction\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' + ) + + def test_invalid_input_parameters(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + with pytest.raises(ValueError): + component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]}) + + with pytest.raises(ValueError): + component.validate_input_parameters( + expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]} + ) + + def test_invalid_outputs(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + with pytest.raises(ValueError): + component.validate_outputs(expected=["score", 
"another_expected_output"], received="{'score': 1.0}") + + with pytest.raises(ValueError): + component.validate_outputs(expected=["score"], received="{'wrong_name': 1.0}") + + def test_unsupported_api(self): + with pytest.raises(ValueError): + LLMEvaluator( + api="unsupported_api", + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + ) From d0f971566bab1b999a6e31c06081876f602ae7ea Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 21 Mar 2024 17:57:53 +0100 Subject: [PATCH 05/17] add release note --- releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml diff --git a/releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml b/releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml new file mode 100644 index 0000000000..3b68f16a33 --- /dev/null +++ b/releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Add a new LLMEvaluator component that leverages LLMs through the OpenAI api to evaluate pipelines. From b42703b5427ceb1682c684735a1128c656742945 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 21 Mar 2024 21:17:33 +0100 Subject: [PATCH 06/17] remove example --- haystack/components/evaluators/example.py | 39 ----------------------- 1 file changed, 39 deletions(-) delete mode 100644 haystack/components/evaluators/example.py diff --git a/haystack/components/evaluators/example.py b/haystack/components/evaluators/example.py deleted file mode 100644 index 4e9cffbd24..0000000000 --- a/haystack/components/evaluators/example.py +++ /dev/null @@ -1,39 +0,0 @@ -# A valid OpenAI API key must be provided as an environment variable "OPENAI_API_KEY" to run this example. -from typing import List - -from haystack import Pipeline -from haystack.components.evaluators import LLMEvaluator - -QUESTIONS = ["Which is the most popular global sport?", "Who created the Python language?"] -CONTEXTS = [ - [ - "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people." - ], - [ - "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects." 
- ], -] -RESPONSES = [ - "Football is the most popular sport with around 4 billion followers worldwide", - "Python language was created by Guido van Rossum.", -] - -GROUND_TRUTHS = ["Football is the most popular sport", "Python language was created by Guido van Rossum."] - -pipeline = Pipeline() -evaluator = LLMEvaluator( - api="openai", - inputs=[("responses", List[str])], - outputs=["score"], - instructions="Is this answer problematic for children?", - examples=[ - {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, - {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, - ], -) -pipeline.add_component("evaluator", evaluator) - -results = pipeline.run({"evaluator": {"responses": RESPONSES}}) - -for output in results["evaluator"]["results"]: - print(output) From 6a216cab220cc410bcbc5bc5dee2cc50065a0532 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 21 Mar 2024 22:02:10 +0100 Subject: [PATCH 07/17] docstrings --- .../components/evaluators/llm_evaluator.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 4cbfdf7b53..c4c444be71 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -10,11 +10,12 @@ @component class LLMEvaluator: """ - Uses an LLM to evaluate inputs based on provided instructions and examples. + Uses an LLM to evaluate inputs based on a prompt containing instructions and examples. - The default api requires an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY". - The inputs of the component are metric-dependent. + The default API requires an OpenAI API key to be provided as an environment variable "OPENAI_API_KEY". + The inputs are lists that are user-defined depending on the desired metric. The output is a dictionary with a key `results` containing a list of evaluation results. + Each result is a dictionary with user-defined keys and values of either 0 for FALSE or 1 for TRUE respectively. Usage example: ```python @@ -47,18 +48,22 @@ def __init__( :param instructions: The prompt instructions to use for evaluation. + Should be a question about the inputs that can be answered with yes or no. :param inputs: - The inputs to use for evaluation. Each input is a tuple containing - the name of the input and the type of the input. + The inputs that the component expects as incoming connections and that it evaluates. + Each input is a tuple of an input name and input type. Input types must be lists. :param outputs: - The output names of the evaluation results. + The output names of the evaluation results. They correspond to keys in the output dictionary. :param api: - The API to use for evaluation. + The API to use for calling an LLM through a Generator. Supported APIs: "openai". :param api_key: - The API key to use. + The API key. :param examples: - Few-shot examples conforming to the input and output format. + Optional few-shot examples conforming to the expected input and output format as defined in the `inputs` and + `outputs` parameters. + Each example is a dictionary with keys "inputs" and "outputs" + They contain the input and output as dictionaries respectively. """ self.instructions = instructions @@ -81,20 +86,15 @@ def run(self, **inputs) -> Dict[str, Any]: Run the LLM evaluator. :param inputs: - The inputs to evaluate. 
These are determined by the - metric being calculated. See :class:`RagasMetric` for more - information. + The input values to evaluate. The keys are the input names and the values are lists of input values. :returns: - A nested list of metric results. Each input can have one or more - results, depending on the metric. Each result is a dictionary - containing the following keys and values: - * `name` - The name of the metric. - * `score` - The score of the metric. + A dictionary with a single `results` entry that contains a list of results. + Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator + and the evaluation results as the values. """ self.validate_input_parameters(dict(self.inputs), inputs) self.validate_lengths(*inputs.values()) - results = [] template = self.prepare_template() builder = PromptBuilder(template=template) @@ -103,11 +103,9 @@ def run(self, **inputs) -> Dict[str, Any]: input_names, values = inputs.keys(), list(zip(*inputs.values())) list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] + results = [] for input_names_to_values in list_of_input_names_to_values: prompt = builder.run(**input_names_to_values) - # TODO rendered prompt should contain " instead of ' for filled in values such as responses. - # and for strings it should contain " instead of currently no delimiters - # json.dumps() instead of str() should be used result = self.generator.run(prompt=prompt["prompt"]) self.validate_outputs(expected=self.outputs, received=result["replies"][0]) @@ -120,6 +118,9 @@ def run(self, **inputs) -> Dict[str, Any]: def prepare_template(self) -> str: """ Combine instructions, inputs, outputs, and examples into one prompt template. + + :returns: + The prompt template. """ inputs_section = ( "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}" @@ -130,7 +131,10 @@ def prepare_template(self) -> str: examples_section += ( "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) + "\n" ) - return f"Respond only in JSON format with a key {json.dumps(self.outputs)} and a value of either 0 for FALSE or 1 for TRUE.\n{self.instructions}\n{examples_section}Inputs:\n{inputs_section}\nOutputs:\n" + return ( + f"Respond only in JSON format with a key {json.dumps(self.outputs)} and a value of either 0 for FALSE " + f"or 1 for TRUE.\n{self.instructions}\n{examples_section}Inputs:\n{inputs_section}\nOutputs:\n" + ) def _get_telemetry_data(self) -> Dict[str, Any]: """ @@ -175,12 +179,18 @@ def validate_lengths(*lists): :param lists: The lists to validate. + + :raises ValueError: + If not all input lists have the same length """ length = len(lists[0]) if all(len(lst) == length for lst in lists[1:]): return True else: - msg = f"LLM evaluator expects all input lists to have the same length but received {lists} with lengths {[len(lst) for lst in lists]}." + msg = ( + f"LLM evaluator expects all input lists to have the same length but received {lists} with lengths " + f"{[len(lst) for lst in lists]}." + ) raise ValueError(msg) @staticmethod From 7c3c59b322b70ba9c7ac949c632d1be1c3fddb16 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 21 Mar 2024 22:13:08 +0100 Subject: [PATCH 08/17] make outputs parameter optional. 
default: --- .../components/evaluators/llm_evaluator.py | 8 ++-- .../evaluators/test_llm_evaluator.py | 43 ++++++++----------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index c4c444be71..4e5a2d99a7 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -37,8 +37,8 @@ def __init__( self, instructions: str, inputs: List[Tuple[str, Type]], - outputs: List[str], *, + outputs: Optional[List[str]] = None, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), examples: Optional[List[Dict[str, Any]]] = None, @@ -53,7 +53,8 @@ def __init__( The inputs that the component expects as incoming connections and that it evaluates. Each input is a tuple of an input name and input type. Input types must be lists. :param outputs: - The output names of the evaluation results. They correspond to keys in the output dictionary. + Optional output names of the evaluation results. They correspond to keys in the output dictionary. + The default is a single key "score". :param api: The API to use for calling an LLM through a Generator. Supported APIs: "openai". @@ -65,10 +66,9 @@ def __init__( Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. """ - self.instructions = instructions self.inputs = inputs - self.outputs = outputs + self.outputs = outputs or ["score"] self.api = api self.api_key = api_key self.examples = examples diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index 78527b1075..da2816f8de 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -10,7 +10,7 @@ class TestLLMEvaluator: def test_init_default(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) assert component.api == "openai" assert component.generator.client.api_key == "test-api-key" assert component.instructions == "test-instruction" @@ -21,35 +21,33 @@ def test_init_default(self, monkeypatch): def test_init_fail_wo_openai_api_key(self, monkeypatch): monkeypatch.delenv("OPENAI_API_KEY", raising=False) with pytest.raises(ValueError, match="None of the .* environment variables are set"): - LLMEvaluator( - api="openai", instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"] - ) + LLMEvaluator(api="openai", instructions="test-instruction", inputs=[("responses", List[str])]) def test_init_with_parameters(self): component = LLMEvaluator( instructions="test-instruction", api_key=Secret.from_token("test-api-key"), inputs=[("responses", List[str])], - outputs=["score"], + outputs=["custom_score"], api="openai", examples=[ - {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, - {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}}, ], ) assert component.generator.client.api_key == "test-api-key" assert component.api == 
"openai" assert component.examples == [ - {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, - {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}}, ] assert component.instructions == "test-instruction" assert component.inputs == [("responses", List[str])] - assert component.outputs == ["score"] + assert component.outputs == ["custom_score"] def test_to_dict_default(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) data = component.to_dict() assert data == { "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", @@ -69,11 +67,11 @@ def test_to_dict_with_parameters(self, monkeypatch): instructions="test-instruction", api_key=Secret.from_env_var("ENV_VAR"), inputs=[("responses", List[str])], - outputs=["score"], + outputs=["custom_score"], api="openai", examples=[ - {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, - {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}}, ], ) data = component.to_dict() @@ -84,10 +82,10 @@ def test_to_dict_with_parameters(self, monkeypatch): "api": "openai", "instructions": "test-instruction", "inputs": [("responses", List[str])], - "outputs": ["score"], + "outputs": ["custom_score"], "examples": [ - {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, - {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}, + {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}, + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}}, ], }, } @@ -141,7 +139,7 @@ def test_prepare_template_with_examples(self, monkeypatch): def test_invalid_input_parameters(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) with pytest.raises(ValueError): component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]}) @@ -152,7 +150,7 @@ def test_invalid_input_parameters(self, monkeypatch): def test_invalid_outputs(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) + component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) with pytest.raises(ValueError): component.validate_outputs(expected=["score", "another_expected_output"], received="{'score': 1.0}") @@ -161,9 +159,4 @@ def test_invalid_outputs(self, monkeypatch): def test_unsupported_api(self): with pytest.raises(ValueError): - LLMEvaluator( - 
api="unsupported_api", - instructions="test-instruction", - inputs=[("responses", List[str])], - outputs=["score"], - ) + LLMEvaluator(api="unsupported_api", instructions="test-instruction", inputs=[("responses", List[str])]) From 980f2011bb0616ddf52a63ecfe8f5282ebe730a2 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 14:28:12 +0100 Subject: [PATCH 09/17] validate init parameters --- .../components/evaluators/llm_evaluator.py | 117 ++++++++++++------ .../evaluators/test_llm_evaluator.py | 45 +++++++ 2 files changed, 126 insertions(+), 36 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 4e5a2d99a7..624488ea14 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -36,7 +36,7 @@ class LLMEvaluator: def __init__( self, instructions: str, - inputs: List[Tuple[str, Type]], + inputs: List[Tuple[str, Type[List]]], *, outputs: Optional[List[str]] = None, api: str = "openai", @@ -66,21 +66,80 @@ def __init__( Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. """ + self.validate_init_parameters(inputs, outputs, examples) + self.instructions = instructions self.inputs = inputs self.outputs = outputs or ["score"] self.api = api self.api_key = api_key self.examples = examples - expected_inputs = dict(inputs) + if api == "openai": self.generator = OpenAIGenerator(api_key=api_key) else: raise ValueError(f"Unsupported API: {api}") - component.set_input_types(self, **expected_inputs) + template = self.prepare_template() + self.builder = PromptBuilder(template=template) + + component.set_input_types(self, **dict(inputs)) + + def validate_init_parameters( + self, + inputs: List[Tuple[str, Type[List]]], + outputs: Optional[List[str]], + examples: Optional[List[Dict[str, Any]]], + ): + """ + Validate the init parameters. + + :param inputs: + The inputs to validate. + :param outputs: + The outputs to validate. + :param examples: + The examples to validate. + + :raises ValueError: + If the inputs are not a list of tuples with a string and a type of list. + If the outputs are not a list of strings. + If the examples are not a list of dictionaries. + """ + # Validate inputs + if ( + not isinstance(inputs, List) + or not all(isinstance(input, Tuple) for input in inputs) + or not all(isinstance(input[0], str) and input[1] is not List and len(input) == 2 for input in inputs) + ): + msg = ( + f"LLM evaluator expects inputs to be a list of tuples. Each tuple must contain an input name and " + f"type of list but received {inputs}." + ) + raise ValueError(msg) + + # Validate outputs + if ( + outputs is not None + and not isinstance(outputs, List) + or not all(isinstance(output, str) for output in outputs) + ): + msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}." + raise ValueError(msg) + + # Validate examples + if examples is not None and ( + not isinstance(examples, List) + or not all(isinstance(example, Dict) for example in examples) + or not all({"inputs", "outputs"} == example.keys() for example in examples) + ): + msg = ( + f"LLM evaluator expects examples to be a list of dictionaries with keys `inputs` and `outputs` " + f"but received {examples}." 
+ ) + raise ValueError(msg) - @component.output_types(results=List[List[Dict[str, Any]]]) + @component.output_types(results=List[Dict[str, Any]]) def run(self, **inputs) -> Dict[str, Any]: """ Run the LLM evaluator. @@ -93,10 +152,6 @@ def run(self, **inputs) -> Dict[str, Any]: and the evaluation results as the values. """ self.validate_input_parameters(dict(self.inputs), inputs) - self.validate_lengths(*inputs.values()) - - template = self.prepare_template() - builder = PromptBuilder(template=template) # inputs is a dictionary with keys being input names and values being a list of input values # We need to iterate through the lists in parallel for all keys of the dictionary @@ -105,7 +160,7 @@ def run(self, **inputs) -> Dict[str, Any]: results = [] for input_names_to_values in list_of_input_names_to_values: - prompt = builder.run(**input_names_to_values) + prompt = self.builder.run(**input_names_to_values) result = self.generator.run(prompt=prompt["prompt"]) self.validate_outputs(expected=self.outputs, received=result["replies"][0]) @@ -136,12 +191,6 @@ def prepare_template(self) -> str: f"or 1 for TRUE.\n{self.instructions}\n{examples_section}Inputs:\n{inputs_section}\nOutputs:\n" ) - def _get_telemetry_data(self) -> Dict[str, Any]: - """ - Data that is sent to Posthog for usage analytics. - """ - return {"api": self.api} - def to_dict(self) -> Dict[str, Any]: """ Serialize this component to a dictionary. @@ -172,27 +221,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator": deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) - @staticmethod - def validate_lengths(*lists): - """ - Validate that all input lists have the same length. - - :param lists: - The lists to validate. - - :raises ValueError: - If not all input lists have the same length - """ - length = len(lists[0]) - if all(len(lst) == length for lst in lists[1:]): - return True - else: - msg = ( - f"LLM evaluator expects all input lists to have the same length but received {lists} with lengths " - f"{[len(lst) for lst in lists]}." - ) - raise ValueError(msg) - @staticmethod def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None: """ @@ -205,12 +233,29 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] :raises ValueError: If not all expected inputs are present in the received inputs + If the received inputs are not lists or have different lengths """ + # Validate that all expected inputs are present in the received inputs for param in expected.keys(): if param not in received: msg = f"LLM evaluator expected input parameter '{param}' but received only {received.keys()}." raise ValueError(msg) + # Validate that all received inputs are lists + if not all(isinstance(input, list) for input in received.values()): + msg = f"LLM evaluator expects all input values to be lists but received {[type(input) for input in received.values()]}." + raise ValueError(msg) + + # Validate that all received inputs are of the same length + inputs = received.values() + length = len(next(iter(inputs))) + if not all(len(input) == length for input in inputs): + msg = ( + f"LLM evaluator expects all input lists to have the same length but received {inputs} with lengths " + f"{[len(input) for input in inputs]}." 
+ ) + raise ValueError(msg) + @staticmethod def validate_outputs(expected: List[str], received: str) -> None: """ diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index da2816f8de..9212217e94 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -45,6 +45,51 @@ def test_init_with_parameters(self): assert component.inputs == [("responses", List[str])] assert component.outputs == ["custom_score"] + def test_init_with_invalid_parameters(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + # Invalid inputs + with pytest.raises(ValueError): + LLMEvaluator(instructions="test-instruction", inputs={("responses", List[str])}) + with pytest.raises(ValueError): + LLMEvaluator(instructions="test-instruction", inputs=[(List[str], "responses")]) + with pytest.raises(ValueError): + LLMEvaluator(instructions="test-instruction", inputs=[List[str]]) + with pytest.raises(ValueError): + LLMEvaluator(instructions="test-instruction", inputs={("responses", str)}) + + # Invalid outputs + with pytest.raises(ValueError): + LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs="score") + with pytest.raises(ValueError): + LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=[["score"]]) + + # Invalid examples + with pytest.raises(ValueError): + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + examples={ + "inputs": {"responses": "Damn, this is straight outta hell!!!"}, + "outputs": {"custom_score": 1}, + }, + ) + with pytest.raises(ValueError): + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + examples=[ + [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}] + ], + ) + with pytest.raises(ValueError): + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + examples=[ + {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}} + ], + ) + def test_to_dict_default(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) From 821372336616f5c8d177adfabcf4c6e22f1d8f36 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 14:44:47 +0100 Subject: [PATCH 10/17] linting --- haystack/components/evaluators/llm_evaluator.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 624488ea14..e48a697d35 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -108,9 +108,9 @@ def validate_init_parameters( """ # Validate inputs if ( - not isinstance(inputs, List) - or not all(isinstance(input, Tuple) for input in inputs) - or not all(isinstance(input[0], str) and input[1] is not List and len(input) == 2 for input in inputs) + not isinstance(inputs, list) + or not all(isinstance(input, tuple) for input in inputs) + or not all(isinstance(input[0], str) and input[1] is not list and len(input) == 2 for input in inputs) ): msg = ( f"LLM evaluator expects inputs to be a list of tuples. 
Each tuple must contain an input name and " @@ -119,18 +119,16 @@ def validate_init_parameters( raise ValueError(msg) # Validate outputs - if ( - outputs is not None - and not isinstance(outputs, List) - or not all(isinstance(output, str) for output in outputs) + if outputs is not None and ( + not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs) ): msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}." raise ValueError(msg) # Validate examples if examples is not None and ( - not isinstance(examples, List) - or not all(isinstance(example, Dict) for example in examples) + not isinstance(examples, list) + or not all(isinstance(example, dict) for example in examples) or not all({"inputs", "outputs"} == example.keys() for example in examples) ): msg = ( From 0a87bf6eee12d9a4bc8e1f2e41af2c073dc96a40 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 15:07:07 +0100 Subject: [PATCH 11/17] remove mention of binary scores from template --- .../components/evaluators/llm_evaluator.py | 42 +++++++++++++++---- .../evaluators/test_llm_evaluator.py | 5 +-- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index e48a697d35..cd063c4fad 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -170,7 +170,20 @@ def run(self, **inputs) -> Dict[str, Any]: def prepare_template(self) -> str: """ - Combine instructions, inputs, outputs, and examples into one prompt template. + Combine instructions, inputs, outputs, and examples into one prompt template with the following format: + Instructions: + + + Generate the response in JSON format with the following keys: + + Consider the instructions and the examples below to determine those values. + + Examples: + + + Inputs: + + Outputs: :returns: The prompt template. 
@@ -178,15 +191,30 @@ def prepare_template(self) -> str: inputs_section = ( "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}" ) - examples_section = "" + if self.examples: - for example in self.examples: - examples_section += ( - "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) + "\n" + examples_section = ( + "Examples:\n" + + "\n".join( + [ + "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) + for example in self.examples + ] ) + + "\n\n" + ) + else: + examples_section = "" return ( - f"Respond only in JSON format with a key {json.dumps(self.outputs)} and a value of either 0 for FALSE " - f"or 1 for TRUE.\n{self.instructions}\n{examples_section}Inputs:\n{inputs_section}\nOutputs:\n" + f"Instructions:\n" + f"{self.instructions}\n\n" + f"Generate the response in JSON format with the following keys:\n" + f"{json.dumps(self.outputs)}\n" + f"Consider the instructions and the examples below to determine those values.\n\n" + f"{examples_section}" + f"Inputs:\n" + f"{inputs_section}\n" + f"Outputs:\n" ) def to_dict(self) -> Dict[str, Any]: diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index 9212217e94..aa004742f9 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -1,4 +1,3 @@ -import os from typing import List import pytest @@ -162,7 +161,7 @@ def test_prepare_template_wo_examples(self, monkeypatch): template = component.prepare_template() assert ( template - == 'Respond only in JSON format with a key ["score"] and a value of either 0 for FALSE or 1 for TRUE.\ntest-instruction\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' + == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' ) def test_prepare_template_with_examples(self, monkeypatch): @@ -179,7 +178,7 @@ def test_prepare_template_with_examples(self, monkeypatch): template = component.prepare_template() assert ( template - == 'Respond only in JSON format with a key ["score"] and a value of either 0 for FALSE or 1 for TRUE.\ntest-instruction\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' + == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' ) def test_invalid_input_parameters(self, monkeypatch): From 3a2ad6a01570c475142bafee3b90befd49554458 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 16:05:37 +0100 Subject: [PATCH 12/17] make examples and outputs params non-optional --- .../components/evaluators/llm_evaluator.py | 53 ++++----- .../evaluators/test_llm_evaluator.py | 107 ++++++++++++++---- 2 files changed, 106 insertions(+), 54 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py 
b/haystack/components/evaluators/llm_evaluator.py index cd063c4fad..46a757780b 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Tuple, Type from haystack import component, default_from_dict, default_to_dict from haystack.components.builders import PromptBuilder @@ -37,11 +37,11 @@ def __init__( self, instructions: str, inputs: List[Tuple[str, Type[List]]], + outputs: List[str], + examples: List[Dict[str, Any]], *, - outputs: Optional[List[str]] = None, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), - examples: Optional[List[Dict[str, Any]]] = None, ): """ Creates an instance of LLMEvaluator. @@ -53,18 +53,19 @@ def __init__( The inputs that the component expects as incoming connections and that it evaluates. Each input is a tuple of an input name and input type. Input types must be lists. :param outputs: - Optional output names of the evaluation results. They correspond to keys in the output dictionary. + Output names of the evaluation results. They correspond to keys in the output dictionary. The default is a single key "score". + :param examples: + Few-shot examples conforming to the expected input and output format as defined in the `inputs` and + `outputs` parameters. + Each example is a dictionary with keys "inputs" and "outputs" + They contain the input and output as dictionaries respectively. :param api: The API to use for calling an LLM through a Generator. Supported APIs: "openai". :param api_key: The API key. - :param examples: - Optional few-shot examples conforming to the expected input and output format as defined in the `inputs` and - `outputs` parameters. - Each example is a dictionary with keys "inputs" and "outputs" - They contain the input and output as dictionaries respectively. + """ self.validate_init_parameters(inputs, outputs, examples) @@ -86,10 +87,7 @@ def __init__( component.set_input_types(self, **dict(inputs)) def validate_init_parameters( - self, - inputs: List[Tuple[str, Type[List]]], - outputs: Optional[List[str]], - examples: Optional[List[Dict[str, Any]]], + self, inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]] ): """ Validate the init parameters. @@ -119,14 +117,12 @@ def validate_init_parameters( raise ValueError(msg) # Validate outputs - if outputs is not None and ( - not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs) - ): + if not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs): msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}." 
raise ValueError(msg) # Validate examples - if examples is not None and ( + if ( not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples) or not all({"inputs", "outputs"} == example.keys() for example in examples) @@ -192,19 +188,16 @@ def prepare_template(self) -> str: "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}" ) - if self.examples: - examples_section = ( - "Examples:\n" - + "\n".join( - [ - "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) - for example in self.examples - ] - ) - + "\n\n" + examples_section = ( + "Examples:\n" + + "\n".join( + [ + "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) + for example in self.examples + ] ) - else: - examples_section = "" + + "\n\n" + ) return ( f"Instructions:\n" f"{self.instructions}\n\n" @@ -229,9 +222,9 @@ def to_dict(self) -> Dict[str, Any]: instructions=self.instructions, inputs=self.inputs, outputs=self.outputs, + examples=self.examples, api=self.api, api_key=self.api_key.to_dict(), - examples=self.examples, ) @classmethod diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index aa004742f9..9584c4cbf2 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -9,18 +9,31 @@ class TestLLMEvaluator: def test_init_default(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) assert component.api == "openai" assert component.generator.client.api_key == "test-api-key" assert component.instructions == "test-instruction" assert component.inputs == [("responses", List[str])] assert component.outputs == ["score"] - assert component.examples == None + assert component.examples == [ + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}} + ] def test_init_fail_wo_openai_api_key(self, monkeypatch): monkeypatch.delenv("OPENAI_API_KEY", raising=False) with pytest.raises(ValueError, match="None of the .* environment variables are set"): - LLMEvaluator(api="openai", instructions="test-instruction", inputs=[("responses", List[str])]) + LLMEvaluator( + api="openai", + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) def test_init_with_parameters(self): component = LLMEvaluator( @@ -48,25 +61,56 @@ def test_init_with_invalid_parameters(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") # Invalid inputs with pytest.raises(ValueError): - LLMEvaluator(instructions="test-instruction", inputs={("responses", List[str])}) + LLMEvaluator( + instructions="test-instruction", + inputs={("responses", List[str])}, + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) with pytest.raises(ValueError): - LLMEvaluator(instructions="test-instruction", inputs=[(List[str], "responses")]) + LLMEvaluator( + instructions="test-instruction", + 
inputs=[(List[str], "responses")], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) with pytest.raises(ValueError): - LLMEvaluator(instructions="test-instruction", inputs=[List[str]]) + LLMEvaluator( + instructions="test-instruction", + inputs=[List[str]], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) with pytest.raises(ValueError): - LLMEvaluator(instructions="test-instruction", inputs={("responses", str)}) + LLMEvaluator( + instructions="test-instruction", + inputs={("responses", str)}, + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) # Invalid outputs with pytest.raises(ValueError): - LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs="score") + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs="score", + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) with pytest.raises(ValueError): - LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=[["score"]]) + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=[["score"]], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) # Invalid examples with pytest.raises(ValueError): LLMEvaluator( instructions="test-instruction", inputs=[("responses", List[str])], + outputs=["score"], examples={ "inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}, @@ -76,6 +120,7 @@ def test_init_with_invalid_parameters(self, monkeypatch): LLMEvaluator( instructions="test-instruction", inputs=[("responses", List[str])], + outputs=["score"], examples=[ [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}] ], @@ -84,6 +129,7 @@ def test_init_with_invalid_parameters(self, monkeypatch): LLMEvaluator( instructions="test-instruction", inputs=[("responses", List[str])], + outputs=["score"], examples=[ {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}} ], @@ -91,7 +137,12 @@ def test_init_with_invalid_parameters(self, monkeypatch): def test_to_dict_default(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) data = component.to_dict() assert data == { "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", @@ -101,7 +152,7 @@ def test_to_dict_default(self, monkeypatch): "instructions": "test-instruction", "inputs": [("responses", List[str])], "outputs": ["score"], - "examples": None, + "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], }, } @@ -140,6 +191,7 @@ def test_run_with_different_lengths(self, monkeypatch): instructions="test-instruction", inputs=[("questions", List[str]), ("responses", List[List[str]])], outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": 
{"score": 0}}], ) def generator_run(self, *args, **kwargs): @@ -155,16 +207,7 @@ def generator_run(self, *args, **kwargs): questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]] ) - def test_prepare_template_wo_examples(self, monkeypatch): - monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])], outputs=["score"]) - template = component.prepare_template() - assert ( - template - == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n' - ) - - def test_prepare_template_with_examples(self, monkeypatch): + def test_prepare_template(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = LLMEvaluator( instructions="test-instruction", @@ -183,7 +226,12 @@ def test_prepare_template_with_examples(self, monkeypatch): def test_invalid_input_parameters(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) with pytest.raises(ValueError): component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]}) @@ -194,7 +242,12 @@ def test_invalid_input_parameters(self, monkeypatch): def test_invalid_outputs(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") - component = LLMEvaluator(instructions="test-instruction", inputs=[("responses", List[str])]) + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) with pytest.raises(ValueError): component.validate_outputs(expected=["score", "another_expected_output"], received="{'score': 1.0}") @@ -203,4 +256,10 @@ def test_invalid_outputs(self, monkeypatch): def test_unsupported_api(self): with pytest.raises(ValueError): - LLMEvaluator(api="unsupported_api", instructions="test-instruction", inputs=[("responses", List[str])]) + LLMEvaluator( + api="unsupported_api", + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) From d25ad80182ad3344e9f368bffdaafa3af0c30b39 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 16:09:58 +0100 Subject: [PATCH 13/17] removed leftover from optional outputs param --- haystack/components/evaluators/llm_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 46a757780b..6b19b223dd 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -71,10 +71,10 @@ def __init__( self.instructions = instructions self.inputs = inputs - self.outputs = outputs or ["score"] + self.outputs = outputs + self.examples = examples self.api = api self.api_key = api_key - self.examples = 
examples if api == "openai": self.generator = OpenAIGenerator(api_key=api_key) From e64b37f07d08c1d4f240478673ff0382a42b3721 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 16:13:45 +0100 Subject: [PATCH 14/17] simplify building examples section for template --- haystack/components/evaluators/llm_evaluator.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 6b19b223dd..29b577b42a 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -188,15 +188,11 @@ def prepare_template(self) -> str: "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}" ) - examples_section = ( - "Examples:\n" - + "\n".join( - [ - "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) - for example in self.examples - ] - ) - + "\n\n" + examples_section = "\n".join( + [ + "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"]) + for example in self.examples + ] ) return ( f"Instructions:\n" @@ -204,7 +200,8 @@ def prepare_template(self) -> str: f"Generate the response in JSON format with the following keys:\n" f"{json.dumps(self.outputs)}\n" f"Consider the instructions and the examples below to determine those values.\n\n" - f"{examples_section}" + f"Examples:\n" + f"{examples_section}\n\n" f"Inputs:\n" f"{inputs_section}\n" f"Outputs:\n" From 9f00a46ccdced0d303ed9c34cf862fe0eee0561b Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 22 Mar 2024 17:30:14 +0100 Subject: [PATCH 15/17] validate inputs and outputs in examples are dict with str as key --- .../components/evaluators/llm_evaluator.py | 5 +++++ .../evaluators/test_llm_evaluator.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 29b577b42a..a63884c5d4 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -126,6 +126,11 @@ def validate_init_parameters( not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples) or not all({"inputs", "outputs"} == example.keys() for example in examples) + or not all( + isinstance(example["inputs"], dict) and isinstance(example["outputs"], dict) for example in examples + ) + or not all(isinstance(key, str) for example in examples for key in example["inputs"]) + or not all(isinstance(key, str) for example in examples for key in example["outputs"]) ): msg = ( f"LLM evaluator expects examples to be a list of dictionaries with keys `inputs` and `outputs` " diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index 9584c4cbf2..41e6c5e61d 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -134,6 +134,25 @@ def test_init_with_invalid_parameters(self, monkeypatch): {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}} ], ) + with pytest.raises(ValueError): + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[ + { + "inputs": [{"responses": "Damn, this is straight outta hell!!!"}], + "outputs": [{"custom_score": 1}], + } + ], + ) + 
with pytest.raises(ValueError): + LLMEvaluator( + instructions="test-instruction", + inputs=[("responses", List[str])], + outputs=["score"], + examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}], + ) def test_to_dict_default(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") From 45e9bd914f5c25e5d07e9aa29fe4d5eecf6e79a4 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Sun, 24 Mar 2024 19:49:31 +0100 Subject: [PATCH 16/17] fix pylint too-many-boolean-expressions --- .../components/evaluators/llm_evaluator.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index a63884c5d4..e5197f32c3 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -103,6 +103,7 @@ def validate_init_parameters( If the inputs are not a list of tuples with a string and a type of list. If the outputs are not a list of strings. If the examples are not a list of dictionaries. + If any example does not have keys "inputs" and "outputs" with values that are dictionaries with string keys. """ # Validate inputs if ( @@ -121,23 +122,24 @@ def validate_init_parameters( msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}." raise ValueError(msg) - # Validate examples - if ( - not isinstance(examples, list) - or not all(isinstance(example, dict) for example in examples) - or not all({"inputs", "outputs"} == example.keys() for example in examples) - or not all( - isinstance(example["inputs"], dict) and isinstance(example["outputs"], dict) for example in examples - ) - or not all(isinstance(key, str) for example in examples for key in example["inputs"]) - or not all(isinstance(key, str) for example in examples for key in example["outputs"]) - ): - msg = ( - f"LLM evaluator expects examples to be a list of dictionaries with keys `inputs` and `outputs` " - f"but received {examples}." - ) + # Validate examples are lists of dicts + if not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples): + msg = f"LLM evaluator expects examples to be a list of dictionaries but received {examples}." raise ValueError(msg) + # Validate each example + for example in examples: + if ( + {"inputs", "outputs"} != example.keys() + or not all(isinstance(example[param], dict) for param in ["inputs", "outputs"]) + or not all(isinstance(key, str) for param in ["inputs", "outputs"] for key in example[param]) + ): + msg = ( + f"LLM evaluator expects each example to have keys `inputs` and `outputs` with values that are " + f"dictionaries with str keys but received {example}." 
+ ) + raise ValueError(msg) + @component.output_types(results=List[Dict[str, Any]]) def run(self, **inputs) -> Dict[str, Any]: """ From bf9ba0ca8910ce26a9a1afad44d81bf011d5a503 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Sun, 24 Mar 2024 20:43:59 +0100 Subject: [PATCH 17/17] increase test coverage --- .../evaluators/test_llm_evaluator.py | 53 +++++++++++++++++-- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py index 41e6c5e61d..df97d6c38e 100644 --- a/test/components/evaluators/test_llm_evaluator.py +++ b/test/components/evaluators/test_llm_evaluator.py @@ -175,6 +175,30 @@ def test_to_dict_default(self, monkeypatch): }, } + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + + data = { + "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", + "init_parameters": { + "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"}, + "api": "openai", + "instructions": "test-instruction", + "inputs": [("responses", List[str])], + "outputs": ["score"], + "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + }, + } + component = LLMEvaluator.from_dict(data) + assert component.api == "openai" + assert component.generator.client.api_key == "test-api-key" + assert component.instructions == "test-instruction" + assert component.inputs == [("responses", List[str])] + assert component.outputs == ["score"] + assert component.examples == [ + {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}} + ] + def test_to_dict_with_parameters(self, monkeypatch): monkeypatch.setenv("ENV_VAR", "test-api-key") component = LLMEvaluator( @@ -214,7 +238,7 @@ def test_run_with_different_lengths(self, monkeypatch): ) def generator_run(self, *args, **kwargs): - return {"replies": [{"score": 0.5}]} + return {"replies": ['{"score": 0.5}']} monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) @@ -226,6 +250,23 @@ def generator_run(self, *args, **kwargs): questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]] ) + def test_run_returns_parsed_result(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") + component = LLMEvaluator( + instructions="test-instruction", + inputs=[("questions", List[str]), ("responses", List[List[str]])], + outputs=["score"], + examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], + ) + + def generator_run(self, *args, **kwargs): + return {"replies": ['{"score": 0.5}']} + + monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run) + + results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"]) + assert results == {"results": [{"score": 0.5, "name": "llm"}]} + def test_prepare_template(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = LLMEvaluator( @@ -251,14 +292,20 @@ def test_invalid_input_parameters(self, monkeypatch): outputs=["score"], examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], ) + # None of the expected parameters are received with pytest.raises(ValueError): component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]}) + # Only one but not all the 
expected parameters are received with pytest.raises(ValueError): component.validate_input_parameters( expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]} ) + # Received inputs are not lists + with pytest.raises(ValueError): + component.validate_input_parameters(expected={"questions": List[str]}, received={"questions": str}) + def test_invalid_outputs(self, monkeypatch): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = LLMEvaluator( @@ -268,10 +315,10 @@ def test_invalid_outputs(self, monkeypatch): examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}], ) with pytest.raises(ValueError): - component.validate_outputs(expected=["score", "another_expected_output"], received="{'score': 1.0}") + component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}') with pytest.raises(ValueError): - component.validate_outputs(expected=["score"], received="{'wrong_name': 1.0}") + component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}') def test_unsupported_api(self): with pytest.raises(ValueError):
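For reviewers, a minimal usage sketch of the component as it stands at the end of this patch series: `outputs` and `examples` are now required constructor arguments, the prompt is assembled from the instructions, the JSON output keys, the few-shot examples, and the inputs, and `run()` returns the parsed LLM replies under `results` (see `test_run_returns_parsed_result`). The instruction text and the response passed to `run()` below are illustrative; the few-shot examples mirror those used in the tests, and a valid `OPENAI_API_KEY` environment variable plus a real API call are assumed.

```python
from typing import List

from haystack.components.evaluators import LLMEvaluator

# Illustrative instruction; outputs and examples are now mandatory.
evaluator = LLMEvaluator(
    instructions="Is this response offensive? Respond with 1 for yes and 0 for no.",
    inputs=[("responses", List[str])],
    outputs=["score"],
    examples=[
        {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
        {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
    ],
)

# Each input value must be a list; run() calls the OpenAI generator, so it needs network access.
results = evaluator.run(responses=["Python was created by Guido van Rossum."])
print(results["results"])  # e.g. [{"score": 0, "name": "llm"}], cf. test_run_returns_parsed_result
```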