diff --git a/haystack/components/evaluators/__init__.py b/haystack/components/evaluators/__init__.py
index 9550a5f42d..88e78e5756 100644
--- a/haystack/components/evaluators/__init__.py
+++ b/haystack/components/evaluators/__init__.py
@@ -1,3 +1,4 @@
 from .answer_exact_match import AnswerExactMatchEvaluator
+from .llm_evaluator import LLMEvaluator
 
-__all__ = ["AnswerExactMatchEvaluator"]
+__all__ = ["AnswerExactMatchEvaluator", "LLMEvaluator"]
diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py
new file mode 100644
index 0000000000..e5197f32c3
--- /dev/null
+++ b/haystack/components/evaluators/llm_evaluator.py
@@ -0,0 +1,298 @@
+import json
+from typing import Any, Dict, List, Tuple, Type
+
+from haystack import component, default_from_dict, default_to_dict
+from haystack.components.builders import PromptBuilder
+from haystack.components.generators import OpenAIGenerator
+from haystack.utils import Secret, deserialize_secrets_inplace
+
+
+@component
+class LLMEvaluator:
+    """
+    Uses an LLM to evaluate inputs based on a prompt containing instructions and examples.
+
+    The default API requires an OpenAI API key to be provided as the environment variable "OPENAI_API_KEY".
+    The inputs are user-defined lists whose names and contents depend on the desired metric.
+    The output is a dictionary with a key `results` containing a list of evaluation results.
+    Each result is a dictionary with user-defined keys and values of either 0 for FALSE or 1 for TRUE.
+
+    Usage example:
+    ```python
+    from typing import List
+
+    from haystack.components.evaluators import LLMEvaluator
+
+    evaluator = LLMEvaluator(
+        instructions="Is this answer problematic for children?",
+        inputs=[("responses", List[str])],
+        outputs=["score"],
+        examples=[
+            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+        ],
+    )
+    RESPONSES = [
+        "Football is the most popular sport with around 4 billion followers worldwide",
+        "Python language was created by Guido van Rossum.",
+    ]
+    results = evaluator.run(responses=RESPONSES)
+    ```
+    """
+
+    def __init__(
+        self,
+        instructions: str,
+        inputs: List[Tuple[str, Type[List]]],
+        outputs: List[str],
+        examples: List[Dict[str, Any]],
+        *,
+        api: str = "openai",
+        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
+    ):
+        """
+        Creates an instance of LLMEvaluator.
+
+        :param instructions:
+            The prompt instructions to use for evaluation.
+            Should be a question about the inputs that can be answered with yes or no.
+        :param inputs:
+            The inputs that the component expects as incoming connections and that it evaluates.
+            Each input is a tuple of an input name and input type. Input types must be lists.
+        :param outputs:
+            Output names of the evaluation results. They correspond to keys in the output dictionary.
+        :param examples:
+            Few-shot examples conforming to the expected input and output format as defined in the `inputs` and
+            `outputs` parameters.
+            Each example is a dictionary with keys "inputs" and "outputs".
+            They contain the input and output as dictionaries respectively.
+        :param api:
+            The API to use for calling an LLM through a Generator.
+            Supported APIs: "openai".
+        :param api_key:
+            The API key.
+
+        """
+        self.validate_init_parameters(inputs, outputs, examples)
+
+        self.instructions = instructions
+        self.inputs = inputs
+        self.outputs = outputs
+        self.examples = examples
+        self.api = api
+        self.api_key = api_key
+
+        if api == "openai":
+            self.generator = OpenAIGenerator(api_key=api_key)
+        else:
+            raise ValueError(f"Unsupported API: {api}")
+
+        template = self.prepare_template()
+        self.builder = PromptBuilder(template=template)
+
+        component.set_input_types(self, **dict(inputs))
+
+    def validate_init_parameters(
+        self, inputs: List[Tuple[str, Type[List]]], outputs: List[str], examples: List[Dict[str, Any]]
+    ):
+        """
+        Validate the init parameters.
+
+        :param inputs:
+            The inputs to validate.
+        :param outputs:
+            The outputs to validate.
+        :param examples:
+            The examples to validate.
+
+        :raises ValueError:
+            If the inputs are not a list of tuples with a string and a type of list.
+            If the outputs are not a list of strings.
+            If the examples are not a list of dictionaries.
+            If any example does not have keys "inputs" and "outputs" with values that are dictionaries with string keys.
+        """
+        # Validate inputs
+        if (
+            not isinstance(inputs, list)
+            or not all(isinstance(input, tuple) for input in inputs)
+            or not all(isinstance(input[0], str) and input[1] is not list and len(input) == 2 for input in inputs)
+        ):
+            msg = (
+                f"LLM evaluator expects inputs to be a list of tuples. Each tuple must contain an input name and "
+                f"type of list but received {inputs}."
+            )
+            raise ValueError(msg)
+
+        # Validate outputs
+        if not isinstance(outputs, list) or not all(isinstance(output, str) for output in outputs):
+            msg = f"LLM evaluator expects outputs to be a list of str but received {outputs}."
+            raise ValueError(msg)
+
+        # Validate examples are lists of dicts
+        if not isinstance(examples, list) or not all(isinstance(example, dict) for example in examples):
+            msg = f"LLM evaluator expects examples to be a list of dictionaries but received {examples}."
+            raise ValueError(msg)
+
+        # Validate each example
+        for example in examples:
+            if (
+                {"inputs", "outputs"} != example.keys()
+                or not all(isinstance(example[param], dict) for param in ["inputs", "outputs"])
+                or not all(isinstance(key, str) for param in ["inputs", "outputs"] for key in example[param])
+            ):
+                msg = (
+                    f"LLM evaluator expects each example to have keys `inputs` and `outputs` with values that are "
+                    f"dictionaries with str keys but received {example}."
+                )
+                raise ValueError(msg)
+
+    @component.output_types(results=List[Dict[str, Any]])
+    def run(self, **inputs) -> Dict[str, Any]:
+        """
+        Run the LLM evaluator.
+
+        :param inputs:
+            The input values to evaluate. The keys are the input names and the values are lists of input values.
+        :returns:
+            A dictionary with a single `results` entry that contains a list of results.
+            Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
+            and the evaluation results as the values.
+        """
+        self.validate_input_parameters(dict(self.inputs), inputs)
+
+        # inputs is a dictionary with keys being input names and values being a list of input values
+        # We need to iterate through the lists in parallel for all keys of the dictionary
+        input_names, values = inputs.keys(), list(zip(*inputs.values()))
+        list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]
+
+        results = []
+        for input_names_to_values in list_of_input_names_to_values:
+            prompt = self.builder.run(**input_names_to_values)
+            result = self.generator.run(prompt=prompt["prompt"])
+
+            self.validate_outputs(expected=self.outputs, received=result["replies"][0])
+            parsed_result = json.loads(result["replies"][0])
+            parsed_result["name"] = "llm"
+            results.append(parsed_result)
+
+        return {"results": results}
+
+    def prepare_template(self) -> str:
+        """
+        Combine instructions, inputs, outputs, and examples into one prompt template with the following format:
+        Instructions:
+        <instructions>
+
+        Generate the response in JSON format with the following keys:
+        <list of output names>
+        Consider the instructions and the examples below to determine those values.
+
+        Examples:
+        <examples>
+
+        Inputs:
+        <inputs>
+        Outputs:
+
+        :returns:
+            The prompt template.
+        """
+        inputs_section = (
+            "{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
+        )
+
+        examples_section = "\n".join(
+            [
+                "Inputs:\n" + json.dumps(example["inputs"]) + "\nOutputs:\n" + json.dumps(example["outputs"])
+                for example in self.examples
+            ]
+        )
+        return (
+            f"Instructions:\n"
+            f"{self.instructions}\n\n"
+            f"Generate the response in JSON format with the following keys:\n"
+            f"{json.dumps(self.outputs)}\n"
+            f"Consider the instructions and the examples below to determine those values.\n\n"
+            f"Examples:\n"
+            f"{examples_section}\n\n"
+            f"Inputs:\n"
+            f"{inputs_section}\n"
+            f"Outputs:\n"
+        )
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        :returns:
+            The serialized component as a dictionary.
+        """
+        return default_to_dict(
+            self,
+            instructions=self.instructions,
+            inputs=self.inputs,
+            outputs=self.outputs,
+            examples=self.examples,
+            api=self.api,
+            api_key=self.api_key.to_dict(),
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "LLMEvaluator":
+        """
+        Deserialize this component from a dictionary.
+
+        :param data:
+            The dictionary representation of this component.
+        :returns:
+            The deserialized component instance.
+        """
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+        return default_from_dict(cls, data)
+
+    @staticmethod
+    def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any]) -> None:
+        """
+        Validate the input parameters.
+
+        :param expected:
+            The expected input parameters.
+        :param received:
+            The received input parameters.
+
+        :raises ValueError:
+            If not all expected inputs are present in the received inputs
+            If the received inputs are not lists or have different lengths
+        """
+        # Validate that all expected inputs are present in the received inputs
+        for param in expected.keys():
+            if param not in received:
+                msg = f"LLM evaluator expected input parameter '{param}' but received only {received.keys()}."
+                raise ValueError(msg)
+
+        # Validate that all received inputs are lists
+        if not all(isinstance(input, list) for input in received.values()):
+            msg = f"LLM evaluator expects all input values to be lists but received {[type(input) for input in received.values()]}."
+            raise ValueError(msg)
+
+        # Validate that all received inputs are of the same length
+        inputs = received.values()
+        length = len(next(iter(inputs)))
+        if not all(len(input) == length for input in inputs):
+            msg = (
+                f"LLM evaluator expects all input lists to have the same length but received {inputs} with lengths "
+                f"{[len(input) for input in inputs]}."
+            )
+            raise ValueError(msg)
+
+    @staticmethod
+    def validate_outputs(expected: List[str], received: str) -> None:
+        """
+        Validate the output.
+
+        :param expected:
+            Names of expected outputs
+        :param received:
+            Names of received outputs
+
+        :raises ValueError:
+            If not all expected outputs are present in the received outputs
+        """
+        parsed_output = json.loads(received)
+        if not all(output in parsed_output for output in expected):
+            msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}."
+            raise ValueError(msg)
diff --git a/releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml b/releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml
new file mode 100644
index 0000000000..3b68f16a33
--- /dev/null
+++ b/releasenotes/notes/llmevaluator-0ae63b2b9715fb9b.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add a new LLMEvaluator component that leverages LLMs through the OpenAI API to evaluate pipelines.
diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py
new file mode 100644
index 0000000000..df97d6c38e
--- /dev/null
+++ b/test/components/evaluators/test_llm_evaluator.py
@@ -0,0 +1,331 @@
+from typing import List
+
+import pytest
+
+from haystack.components.evaluators import LLMEvaluator
+from haystack.utils.auth import Secret
+
+
+class TestLLMEvaluator:
+    def test_init_default(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("responses", List[str])],
+            outputs=["score"],
+            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+        )
+        assert component.api == "openai"
+        assert component.generator.client.api_key == "test-api-key"
+        assert component.instructions == "test-instruction"
+        assert component.inputs == [("responses", List[str])]
+        assert component.outputs == ["score"]
+        assert component.examples == [
+            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+        ]
+
+    def test_init_fail_wo_openai_api_key(self, monkeypatch):
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
+            LLMEvaluator(
+                api="openai",
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+
+    def test_init_with_parameters(self):
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            api_key=Secret.from_token("test-api-key"),
+            inputs=[("responses", List[str])],
+            outputs=["custom_score"],
+            api="openai",
+            examples=[
+                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            ],
+        )
+        assert component.generator.client.api_key == "test-api-key"
+        assert component.api == "openai"
+        assert component.examples == [
+            {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+        ]
+        assert component.instructions == "test-instruction"
+        assert component.inputs == [("responses", List[str])]
+        assert component.outputs == ["custom_score"]
+
+    def test_init_with_invalid_parameters(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        # Invalid inputs
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs={("responses", List[str])},
+                outputs=["score"],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[(List[str], "responses")],
+                outputs=["score"],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[List[str]],
+                outputs=["score"],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs={("responses", str)},
+                outputs=["score"],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+
+        # Invalid outputs
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs="score",
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=[["score"]],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
+
+        # Invalid examples
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples={
+                    "inputs": {"responses": "Damn, this is straight outta hell!!!"},
+                    "outputs": {"custom_score": 1},
+                },
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples=[
+                    [{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}]
+                ],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples=[
+                    {"wrong_key": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}}
+                ],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples=[
+                    {
+                        "inputs": [{"responses": "Damn, this is straight outta hell!!!"}],
+                        "outputs": [{"custom_score": 1}],
+                    }
+                ],
+            )
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
+            )
+
+    def test_to_dict_default(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("responses", List[str])],
+            outputs=["score"],
+            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "instructions": "test-instruction",
+                "inputs": [("responses", List[str])],
+                "outputs": ["score"],
+                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            },
+        }
+
+    def test_from_dict(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+
+        data = {
+            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "instructions": "test-instruction",
+                "inputs": [("responses", List[str])],
+                "outputs": ["score"],
+                "examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            },
+        }
+        component = LLMEvaluator.from_dict(data)
+        assert component.api == "openai"
+        assert component.generator.client.api_key == "test-api-key"
+        assert component.instructions == "test-instruction"
+        assert component.inputs == [("responses", List[str])]
+        assert component.outputs == ["score"]
+        assert component.examples == [
+            {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
+        ]
+
+    def test_to_dict_with_parameters(self, monkeypatch):
+        monkeypatch.setenv("ENV_VAR", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            api_key=Secret.from_env_var("ENV_VAR"),
+            inputs=[("responses", List[str])],
+            outputs=["custom_score"],
+            api="openai",
+            examples=[
+                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+            ],
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
+            "init_parameters": {
+                "api_key": {"env_vars": ["ENV_VAR"], "strict": True, "type": "env_var"},
+                "api": "openai",
+                "instructions": "test-instruction",
+                "inputs": [("responses", List[str])],
+                "outputs": ["custom_score"],
+                "examples": [
+                    {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
+                    {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
+                ],
+            },
+        }
+
+    def test_run_with_different_lengths(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            outputs=["score"],
+            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+        )
+
+        def generator_run(self, *args, **kwargs):
+            return {"replies": ['{"score": 0.5}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        with pytest.raises(ValueError):
+            component.run(questions=["What is the capital of Germany?"], responses=[["Berlin"], ["Paris"]])
+
+        with pytest.raises(ValueError):
+            component.run(
+                questions=["What is the capital of Germany?", "What is the capital of France?"], responses=[["Berlin"]]
+            )
+
+    def test_run_returns_parsed_result(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("questions", List[str]), ("responses", List[List[str]])],
+            outputs=["score"],
+            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+        )
+
+        def generator_run(self, *args, **kwargs):
+            return {"replies": ['{"score": 0.5}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        results = component.run(questions=["What is the capital of Germany?"], responses=["Berlin"])
+        assert results == {"results": [{"score": 0.5, "name": "llm"}]}
+
+    def test_prepare_template(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("responses", List[str])],
+            outputs=["score"],
+            examples=[
+                {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
+                {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
+            ],
+        )
+        template = component.prepare_template()
+        assert (
+            template
+            == 'Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:\n["score"]\nConsider the instructions and the examples below to determine those values.\n\nExamples:\nInputs:\n{"responses": "Damn, this is straight outta hell!!!"}\nOutputs:\n{"score": 1}\nInputs:\n{"responses": "Football is the most popular sport."}\nOutputs:\n{"score": 0}\n\nInputs:\n{"responses": {{ responses }}}\nOutputs:\n'
+        )
+
+    def test_invalid_input_parameters(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("responses", List[str])],
+            outputs=["score"],
+            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+        )
+        # None of the expected parameters are received
+        with pytest.raises(ValueError):
+            component.validate_input_parameters(expected={"responses": List[str]}, received={"questions": List[str]})
+
+        # Only one but not all the expected parameters are received
+        with pytest.raises(ValueError):
+            component.validate_input_parameters(
+                expected={"responses": List[str], "questions": List[str]}, received={"questions": List[str]}
+            )
+
+        # Received inputs are not lists
+        with pytest.raises(ValueError):
+            component.validate_input_parameters(expected={"questions": List[str]}, received={"questions": str})
+
+    def test_invalid_outputs(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("responses", List[str])],
+            outputs=["score"],
+            examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+        )
+        with pytest.raises(ValueError):
+            component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')
+
+        with pytest.raises(ValueError):
+            component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}')
+
+    def test_unsupported_api(self):
+        with pytest.raises(ValueError):
+            LLMEvaluator(
+                api="unsupported_api",
+                instructions="test-instruction",
+                inputs=[("responses", List[str])],
+                outputs=["score"],
+                examples=[{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
+            )
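
Because `component.set_input_types` registers the user-defined inputs as regular component inputs, the evaluator can also be wired into a pipeline. A minimal sketch, assuming the standard Haystack 2.x `Pipeline` API and an `OPENAI_API_KEY` in the environment; the component and input names follow the diff above, the specific questions and expected scores are only illustrative:

```python
from typing import List

from haystack import Pipeline
from haystack.components.evaluators import LLMEvaluator

evaluator = LLMEvaluator(
    instructions="Is this answer problematic for children?",
    inputs=[("responses", List[str])],
    outputs=["score"],
    examples=[
        {"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
        {"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
    ],
)

pipeline = Pipeline()
pipeline.add_component("evaluator", evaluator)

# The "responses" input declared above is now a pipeline input of the evaluator component.
result = pipeline.run(data={"evaluator": {"responses": ["Python language was created by Guido van Rossum."]}})

# Each result carries the user-defined output keys plus "name": "llm",
# e.g. {"results": [{"score": 0, "name": "llm"}]} (actual scores depend on the LLM).
print(result["evaluator"]["results"])
```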