Commit
Merge branch 'main' into docstrings-linting
davidsbatista committed Apr 23, 2024
2 parents 7997915 + d7638cf commit f7ae004
Showing 23 changed files with 477 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/project.yml
@@ -10,7 +10,7 @@ jobs:
    name: Add new issues to project for triage
    runs-on: ubuntu-latest
    steps:
      - uses: actions/[email protected].0
      - uses: actions/[email protected].1
        with:
          project-url: https://github.com/orgs/deepset-ai/projects/5
          github-token: ${{ secrets.GH_PROJECT_PAT }}
39 changes: 39 additions & 0 deletions docs/pydoc/config/evaluators_api.yml
@@ -0,0 +1,39 @@
loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path: [../../../haystack/components/evaluators]
    modules:
      [
        "answer_exact_match",
        "context_relevance",
        "document_map",
        "document_mrr",
        "document_recall",
        "evaluation_result",
        "document_recall",
        "faithfulness",
        "llm_evaluator",
        "sas_evaluator",
      ]
    ignore_when_discovered: ["__init__"]
processors:
  - type: filter
    expression:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
  - type: smart
  - type: crossref
renderer:
  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
  excerpt: Evaluate your pipelines or individual components.
  category_slug: haystack-api
  title: Evaluators
  slug: evaluators-api
  order: 5
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
    descriptive_module_title: true
    add_method_class_prefix: true
    add_member_class_prefix: false
  filename: evaluators_api.md
2 changes: 1 addition & 1 deletion haystack/components/caching/cache_checker.py
@@ -76,7 +76,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "CacheChecker":

        try:
            module_name, type_ = init_params["document_store"]["type"].rsplit(".", 1)
            logger.debug("Trying to import module '{module}'", module=module_name)
            logger.debug("Trying to import module '{module_name}'", module_name=module_name)
            module = importlib.import_module(module_name)
        except (ImportError, DeserializationError) as e:
            raise DeserializationError(
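The placeholder rename above (repeated in filter_retriever.py, document_writer.py, and component.py below) makes the name inside `{...}` match the keyword argument passed to `logger.debug`. A minimal sketch of why the two must agree, assuming the logger resolves placeholders from its keyword arguments the way `str.format` does:

```python
# Minimal sketch (assumption: the logger substitutes "{name}" placeholders from its
# keyword arguments, in the same way str.format does).
old_template = "Trying to import module '{module}'"
new_template = "Trying to import module '{module_name}'"

try:
    old_template.format(module_name="haystack.document_stores.in_memory")
except KeyError as err:
    print(f"unresolved placeholder: {err}")  # the mismatched name raises KeyError: 'module'

# With matching names the message renders as intended.
print(new_template.format(module_name="haystack.document_stores.in_memory"))
```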
2 changes: 2 additions & 0 deletions haystack/components/evaluators/__init__.py
@@ -1,4 +1,5 @@
from .answer_exact_match import AnswerExactMatchEvaluator
from .context_relevance import ContextRelevanceEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_recall import DocumentRecallEvaluator
@@ -9,6 +10,7 @@

__all__ = [
    "AnswerExactMatchEvaluator",
    "ContextRelevanceEvaluator",
    "DocumentMAPEvaluator",
    "DocumentMRREvaluator",
    "DocumentRecallEvaluator",
154 changes: 154 additions & 0 deletions haystack/components/evaluators/context_relevance.py
@@ -0,0 +1,154 @@
from typing import Any, Dict, List, Optional

from numpy import mean as np_mean

from haystack import default_from_dict
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
    {
        "inputs": {
            "questions": "What is the capital of Germany?",
            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
        },
        "outputs": {
            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
            "statement_scores": [1, 0],
        },
    },
    {
        "inputs": {"questions": "What is the capital of France?", "contexts": ["Berlin is the capital of Germany."]},
        "outputs": {"statements": ["Berlin is the capital of Germany."], "statement_scores": [0]},
    },
    {
        "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
        "outputs": {"statements": ["Rome is the capital of Italy."], "statement_scores": [1]},
    },
]


class ContextRelevanceEvaluator(LLMEvaluator):
    """
    Evaluator that checks if a provided context is relevant to the question.
    An LLM breaks the context up into multiple statements and checks whether each statement is relevant for
    answering the question. The final score for a question's contexts is a number from 0.0 to 1.0, representing
    the proportion of statements that are relevant to the question.
    Usage example:
    ```python
    from haystack.components.evaluators import ContextRelevanceEvaluator
    questions = ["Who created the Python language?"]
    contexts = [
        [
            "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
        ],
    ]
    evaluator = ContextRelevanceEvaluator()
    result = evaluator.run(questions=questions, contexts=contexts)
    print(result["score"])
    # 1.0
    print(result["individual_scores"])
    # [1.0]
    print(result["results"])
    # [{'statements': ['Python, created by Guido van Rossum in the late 1980s.'], 'statement_scores': [1], 'score': 1.0}]
    ```
    """

    def __init__(
        self,
        examples: Optional[List[Dict[str, Any]]] = None,
        api: str = "openai",
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
    ):
        """
        Creates an instance of ContextRelevanceEvaluator.
        :param examples:
            Optional few-shot examples conforming to the expected input and output format of ContextRelevanceEvaluator.
            Default examples will be used if none are provided.
            Each example must be a dictionary with keys "inputs" and "outputs".
            "inputs" must be a dictionary with keys "questions" and "contexts".
            "outputs" must be a dictionary with "statements" and "statement_scores".
            Expected format:
            [{
                "inputs": {
                    "questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy."],
                    "statement_scores": [1],
                },
            }]
        :param api:
            The API to use for calling an LLM through a Generator.
            Supported APIs: "openai".
        :param api_key:
            The API key.
        """
        self.instructions = (
            "Your task is to judge how relevant the provided context is for answering a question. "
            "First, please extract statements from the provided context. "
            "Second, calculate a relevance score for each statement in the context. "
            "The score is 1 if the statement is relevant to answer the question or 0 if it is not relevant."
        )
        self.inputs = [("questions", List[str]), ("contexts", List[List[str]])]
        self.outputs = ["statements", "statement_scores"]
        self.examples = examples or _DEFAULT_EXAMPLES
        self.api = api
        self.api_key = api_key

        super().__init__(
            instructions=self.instructions,
            inputs=self.inputs,
            outputs=self.outputs,
            examples=self.examples,
            api=self.api,
            api_key=self.api_key,
        )

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
        """
        Run the LLM evaluator.
        :param questions:
            A list of questions.
        :param contexts:
            A list of lists of contexts. Each list of contexts corresponds to one question.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean context relevance score over all the provided input questions.
                - `individual_scores`: A list of context relevance scores for each input question.
                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input context.
        """
        result = super().run(questions=questions, contexts=contexts)

        # calculate average statement relevance score per query
        for res in result["results"]:
            res["score"] = np_mean(res["statement_scores"])

        # calculate average context relevance score over all queries
        result["score"] = np_mean([res["score"] for res in result["results"]])
        result["individual_scores"] = [res["score"] for res in result["results"]]

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ContextRelevanceEvaluator":
        """
        Deserialize this component from a dictionary.
        :param data:
            The dictionary representation of this component.
        :returns:
            The deserialized component instance.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
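For reference, a hedged sketch of constructing the new evaluator with custom few-shot `examples` in the format documented in `__init__` (the questions, contexts, and scores below are illustrative only, and a valid `OPENAI_API_KEY` must be set in the environment for the underlying LLM call):

```python
from haystack.components.evaluators import ContextRelevanceEvaluator

# Illustrative few-shot examples following the documented "inputs"/"outputs" format.
custom_examples = [
    {
        "inputs": {
            "questions": "Who wrote 'Dubliners'?",
            "contexts": ["James Joyce wrote 'Dubliners'. Joyce was born in Dublin in 1882."],
        },
        "outputs": {
            "statements": ["James Joyce wrote 'Dubliners'.", "Joyce was born in Dublin in 1882."],
            "statement_scores": [1, 0],
        },
    },
]

evaluator = ContextRelevanceEvaluator(examples=custom_examples)
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[["Python was created by Guido van Rossum in the late 1980s."]],
)
print(result["score"], result["individual_scores"], result["results"])
```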
82 changes: 45 additions & 37 deletions haystack/components/evaluators/faithfulness.py
@@ -7,6 +7,40 @@
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
    {
        "inputs": {
            "questions": "What is the capital of Germany and when was it founded?",
            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
        },
        "outputs": {
            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
            "statement_scores": [1, 1],
        },
    },
    {
        "inputs": {
            "questions": "What is the capital of France?",
            "contexts": ["Berlin is the capital of Germany."],
            "responses": "Paris",
        },
        "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
    },
    {
        "inputs": {
            "questions": "What is the capital of Italy?",
            "contexts": ["Rome is the capital of Italy."],
            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
        },
        "outputs": {
            "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
            "statement_scores": [1, 0],
        },
    },
]


class FaithfulnessEvaluator(LLMEvaluator):
    """
@@ -50,7 +84,8 @@ def __init__(
        Creates an instance of FaithfulnessEvaluator.
        :param examples:
            Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
            Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
            Default examples will be used if none are provided.
            Each example must be a dictionary with keys "inputs" and "outputs".
            "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
            "outputs" must be a dictionary with "statements" and "statement_scores".
@@ -81,38 +116,7 @@ def __init__(
        )
        self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
        self.outputs = ["statements", "statement_scores"]
        self.examples = examples or [
            {
                "inputs": {
                    "questions": "What is the capital of Germany and when was it founded?",
                    "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
                },
                "outputs": {
                    "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
                    "statement_scores": [1, 1],
                },
            },
            {
                "inputs": {
                    "questions": "What is the capital of France?",
                    "contexts": ["Berlin is the capital of Germany."],
                    "responses": "Paris",
                },
                "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
            },
            {
                "inputs": {
                    "questions": "What is the capital of Italy?",
                    "contexts": ["Rome is the capital of Italy."],
                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
                },
                "outputs": {
                    "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
                    "statement_scores": [1, 0],
                },
            },
        ]
        self.examples = examples or _DEFAULT_EXAMPLES
        self.api = api
        self.api_key = api_key

@@ -126,19 +130,23 @@ def __init__(
        )

    @component.output_types(results=List[Dict[str, Any]])
    def run(self, **inputs) -> Dict[str, Any]:
    def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
        """
        Run the LLM evaluator.
        :param inputs:
            The input values to evaluate. The keys are the input names and the values are lists of input values.
        :param questions:
            A list of questions.
        :param contexts:
            A nested list of contexts that correspond to the questions.
        :param responses:
            A list of responses.
        :returns:
            A dictionary with the following outputs:
                - `score`: Mean faithfulness score over all the provided input answers.
                - `individual_scores`: A list of faithfulness scores for each input answer.
                - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
        """
        result = super().run(**inputs)
        result = super().run(questions=questions, contexts=contexts, responses=responses)

        # calculate average statement faithfulness score per query
        for res in result["results"]:
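Because `run` now takes `questions`, `contexts`, and `responses` explicitly instead of `**inputs`, a hedged usage sketch against the new signature (the values are illustrative and a valid `OPENAI_API_KEY` is required for a real call):

```python
from haystack.components.evaluators import FaithfulnessEvaluator

questions = ["Who created the Python language?"]
contexts = [["Python was created by Guido van Rossum in the late 1980s."]]
responses = ["Guido van Rossum created Python."]

evaluator = FaithfulnessEvaluator()  # falls back to the module-level _DEFAULT_EXAMPLES
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)

# Outputs documented in the run() docstring:
print(result["score"])              # mean faithfulness score over all answers
print(result["individual_scores"])  # one score per answer
print(result["results"])            # statements and statement_scores per answer
```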
2 changes: 1 addition & 1 deletion haystack/components/retrievers/filter_retriever.py
@@ -79,7 +79,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "FilterRetriever":
            raise DeserializationError("Missing 'type' in document store's serialization data")
        try:
            module_name, type_ = init_params["document_store"]["type"].rsplit(".", 1)
            logger.debug("Trying to import module '{module}'", module=module_name)
            logger.debug("Trying to import module '{module_name}'", module_name=module_name)
            module = importlib.import_module(module_name)
        except (ImportError, DeserializationError) as e:
            raise DeserializationError(
2 changes: 1 addition & 1 deletion haystack/components/writers/document_writer.py
@@ -76,7 +76,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentWriter":

        try:
            module_name, type_ = init_params["document_store"]["type"].rsplit(".", 1)
            logger.debug("Trying to import module '{module}'", module=module_name)
            logger.debug("Trying to import module '{module_name}'", module_name=module_name)
            module = importlib.import_module(module_name)
        except (ImportError, DeserializationError) as e:
            raise DeserializationError(
6 changes: 3 additions & 3 deletions haystack/core/component/component.py
@@ -411,10 +411,10 @@ def copy_class_namespace(namespace):
        if class_path in self.registry:
            # Corner case, but it may occur easily in notebooks when re-running cells.
            logger.debug(
                "Component {component} is already registered. Previous imported from '{module}', new imported from '{new_module}'",
                "Component {component} is already registered. Previous imported from '{module_name}', new imported from '{new_module_name}'",
                component=class_path,
                module=self.registry[class_path],
                new_module=cls,
                module_name=self.registry[class_path],
                new_module_name=cls,
            )
        self.registry[class_path] = cls
        logger.debug("Registered Component {component}", component=cls)
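The duplicate-registration branch mostly covers notebooks: re-running a cell that defines a component registers the same class path again, and only the debug message above is emitted. A rough sketch of that corner case, using a hypothetical `Echo` component (not part of Haystack):

```python
from typing import Dict

from haystack.core.component import component


@component
class Echo:
    @component.output_types(text=str)
    def run(self, text: str) -> Dict[str, str]:
        return {"text": text}


# Re-declaring the class (as happens when a notebook cell is re-run) re-registers the
# same class path; the registry logs the debug message and keeps the newest class.
@component
class Echo:  # noqa: F811 - intentional redefinition for illustration
    @component.output_types(text=str)
    def run(self, text: str) -> Dict[str, str]:
        return {"text": text}
```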