Skip to content

Commit

Permalink
feat: use chunk data in NIAH and QA evals (#1176)
Browse files Browse the repository at this point in the history
* incorporate chunk data into NIAH retrieval metric
* add chunk_rank metric
* add DeepEval metrics
* fix NIAH padding bug
  • Loading branch information
jalling97 authored Oct 7, 2024
1 parent b9f6413 commit ad697cd
Show file tree
Hide file tree
Showing 9 changed files with 177 additions and 32 deletions.
2 changes: 1 addition & 1 deletion src/leapfrogai_evals/.env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1"
LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev"
LEAPFROGAI_API_KEY="lfai-api-key"
ANTHROPIC_API_KEY="anthropic-api-key"

Expand Down
3 changes: 2 additions & 1 deletion src/leapfrogai_evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ cp .env.example .env
Within `.env`, replace the necessary environment variables:

```bash
LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev/openai/v1 for development>
LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev for development>
LEAPFROGAI_API_KEY=<LeapfrogAI API key>
ANTHROPIC_API_KEY=<Anthropic API key>
```
Expand Down Expand Up @@ -108,6 +108,7 @@ The LeapfrogAI NIAH evaluation uses the following process:
- prompt the LLM to provide the secret code hidden in the context
- record the following:
- whether or not the needle text was returned by the retrieval step of RAG
- which chunk from the retrieval step the needle was found in, if present
- whether or not the needle text was returned by the LLM's final response
- delete the contextual document from the vector store
- delete the assistant
Expand Down
6 changes: 4 additions & 2 deletions src/leapfrogai_evals/evals/niah_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response, NIAH_Chunk_Rank
from leapfrogai_evals.runners import NIAH_Runner


Expand All @@ -26,6 +26,7 @@ def niah_eval(*args, **kwargs) -> dict:
additional_metadata={
"retrieval_score": row["retrieval_score"],
"response_score": row["response_score"],
"chunk_rank": row["chunk_rank"],
},
)
)
Expand All @@ -34,7 +35,8 @@ def niah_eval(*args, **kwargs) -> dict:
# TODO: Give ability to choose which metrics to run
retrieval_metric = NIAH_Retrieval()
response_metric = NIAH_Response()
metrics = [retrieval_metric, response_metric]
chunk_rank_metric = NIAH_Chunk_Rank()
metrics = [retrieval_metric, response_metric, chunk_rank_metric]

# record scores and return results
for metric in metrics:
Expand Down
12 changes: 10 additions & 2 deletions src/leapfrogai_evals/evals/qa_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
import numpy as np
import os

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import (
AnswerRelevancyMetric,
ContextualRelevancyMetric,
FaithfulnessMetric,
)
from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
Expand All @@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
actual_output=row["actual_output"],
context=row["context"],
expected_output=row["expected_output"],
retrieval_context=row["retrieval_context"],
additional_metadata={
"actual_annotations": row["actual_annotations"],
"expected_annotations": row["expected_annotations"],
},
# retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
)
)

Expand All @@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
# TODO: Give ability to choose which metrics to run
correctness_metric = CorrectnessMetric(model=judge_model)
answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
faithfulness_metric = FaithfulnessMetric(model=judge_model)
annotation_relevancy_metric = AnnotationRelevancyMetric()
metrics = [
correctness_metric,
answer_relevancy_metric,
contextual_relevancy_metric,
faithfulness_metric,
annotation_relevancy_metric,
]

Expand Down
6 changes: 5 additions & 1 deletion src/leapfrogai_evals/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,8 @@

from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
from leapfrogai_evals.metrics.correctness import CorrectnessMetric
from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
from leapfrogai_evals.metrics.niah_metrics import (
NIAH_Response,
NIAH_Retrieval,
NIAH_Chunk_Rank,
)
54 changes: 54 additions & 0 deletions src/leapfrogai_evals/metrics/niah_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,57 @@ def is_successful(self) -> bool:
@property
def __name__(self):
return "Needle in a Haystack (NIAH) Response"


class NIAH_Chunk_Rank(BaseMetric):
    """A metric for measuring the chunk rank score from the LFAI Needle in a Haystack Evaluation"""

    def __init__(
        self,
        threshold: float = 1.0,
        async_mode: bool = True,
    ):
        """
        params:
        -------
        threshold: float
            Minimum chunk rank score a test case must reach to count as successful.
        async_mode: bool
            Stored on the instance as `async_mode`; this class does not read it itself.
        """
        self.threshold = threshold
        self.async_mode = async_mode

    def measure(self, test_case: LLMTestCase) -> float:
        """
        Records the niah chunk_rank from the test case

        This function reads the chunk rank score (provided by the niah_runner) from the
        test case metadata, stores it as this metric's score, and marks the metric as
        successful when the score is greater than or equal to the threshold. The score is
        calculated in the runner to keep the runner self-contained as a means of running the
        entire evaluation on its own. For simplicity, the score is copied here for
        integration with DeepEval.

        params:
        -------
        test_case: LLMTestCase
            A test case object built from the results of a needle in a haystack evaluation run.
            test_case should contain an additional metadata field that returns a dictionary with
            the field "chunk_rank"

        returns:
        -------
        float
            A score that is equal to the "chunk_rank" from the test_case

        raises:
        -------
        KeyError
            If the additional metadata dictionary has no "chunk_rank" entry
        """
        # No validation is done here: the value is copied from the metadata as-is.
        self.score = test_case.additional_metadata["chunk_rank"]
        self.success = self.score >= self.threshold

        if self.success:
            self.reason = f"Chunk rank in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
        else:
            self.reason = f"Chunk rank in the NIAH evaluation scored less than the threshold score of {self.threshold}"

        return self.score

    async def a_measure(self, test_case: LLMTestCase) -> float:
        """Async variant of `measure`: runs it in the running event loop's default executor."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.measure, test_case)

    def is_successful(self) -> bool:
        """Return the success flag set by the most recent call to `measure`."""
        return self.success

    @property
    def __name__(self):
        # Exposed as a property so the instance carries a human-readable metric name.
        return "Needle in a Haystack (NIAH) Chunk Rank"
2 changes: 1 addition & 1 deletion src/leapfrogai_evals/models/lfai.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(
):
self.model = model or os.getenv("MODEL_TO_EVALUATE")
self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL")
self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)

def load_model(self):
Expand Down
92 changes: 71 additions & 21 deletions src/leapfrogai_evals/runners/niah_runner.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import ast
import logging
import numpy as np
import os
import openai
import requests

from datasets import load_dataset, concatenate_datasets
from distutils.util import strtobool
Expand Down Expand Up @@ -78,7 +80,7 @@ def __init__(
)

self.client = openai.OpenAI(
base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"),
base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1",
api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"),
)
logging.info(f"client url: {self.client.base_url}")
Expand All @@ -91,8 +93,6 @@ def __init__(
num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)),
)
self._create_vector_store()
self.retrieval_score = None
self.response_score = None

def run_experiment(self, cleanup: bool = True) -> None:
"""
Expand All @@ -110,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
try:
retrieval_scores = []
response_scores = []
chunk_ranks = []
response_contents = []

for row in tqdm(self.niah_data, desc="Evaluating data rows"):
Expand Down Expand Up @@ -162,32 +163,51 @@ def run_experiment(self, cleanup: bool = True) -> None:

retrieval_score = 0.0
response_score = 0.0
chunk_rank = 0.0
response_content = ""

for response in response_messages:
response_content += response.content[0].text.value + "\n"
secret_code = row["secret_code"]
chunk_ids = ast.literal_eval(response.metadata["vector_ids"])

# retrieval_score
# 1 if needle text was returned by the retrieval step of RAG else 0
logging.debug(
f"number of annotations in response: {len(response.content[0].text.annotations)}"
)
for annotation in response.content[0].text.annotations:
annotation_id = annotation.file_citation.file_id
if annotation_id == self.current_file:
logging.debug("Setting retrieval_score to 1.0")
# 1 if needle text is found in any chunk in the context, else 0
# chunk_rank
# see _calculate_chunk_rank for explanation
for chunk_num, chunk_id in enumerate(chunk_ids):
logging.info(f"chunk {chunk_num} (id: {chunk_id})")
vector_response = requests.get(
url=os.getenv("LEAPFROGAI_API_URL")
+ "/leapfrogai/v1/vector_stores/vector/"
+ chunk_id,
headers={
"accept": "application/json",
"Authorization": "Bearer "
+ os.getenv("LEAPFROGAI_API_KEY"),
},
).json()
logging.info(f"chunk_data: {vector_response['content']}")

if secret_code in vector_response["content"]:
logging.info(
f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}"
)
chunk_rank = self._calculate_chunk_rank(
chunk_place=chunk_num, total_chunks=len(chunk_ids)
)
retrieval_score = 1.0

# # response_score
# # 1 if needle text was returned by the LLM's final response else 0
secret_code = row["secret_code"]
# response_score
# 1 if needle text was returned by the LLM's final response else 0
logging.info(f"Response message: {response.content[0].text.value}")
if secret_code in response.content[0].text.value:
logging.debug("Setting response_score to 1.0")
response_score = 1.0

retrieval_scores.append(retrieval_score)
response_scores.append(response_score)
chunk_ranks.append(chunk_rank)
response_contents.append(response_content)

# delete file to clean up the vector store
Expand All @@ -210,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None:
self.niah_data = self.niah_data.add_column(
name="response_score", column=response_scores
)
self.niah_data = self.niah_data.add_column(
name="chunk_rank", column=chunk_ranks
)
self.niah_data = self.niah_data.add_column(
name="response", column=response_contents
)

self.retrieval_score = np.mean(retrieval_scores)
self.response_score = np.mean(response_scores)

logging.info(f"Retrieval Score {self.retrieval_score}")
logging.info(f"Response Score {self.response_score}")
logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}")
logging.info(f"Response Score: {np.mean(response_scores)}")
logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}")

# remove artifacts from the API if the experiment fails
except Exception as exc:
Expand Down Expand Up @@ -264,7 +285,8 @@ def _load_niah_dataset(
"""
logging.info(f"Downloading dataset: {dataset_name} from HuggingFace")
niah_dataset = load_dataset(dataset_name)
self.padding = niah_dataset["padding"]
if self.add_padding:
self.padding = niah_dataset["padding"]
niah_dataset = concatenate_datasets(
[
niah_dataset["base_eval"],
Expand Down Expand Up @@ -339,8 +361,11 @@ def _create_vector_store(self) -> VectorStore:
logging.debug(
f"Added {len(self.padding)} files as padding to the haystack vector store"
)
self.padding = self.padding.add_column(
name="padding_id", column=padding_ids
)

self.vector_store = vector_store
self.padding = self.padding.add_column(name="padding_id", column=padding_ids)

def _delete_vector_store(self, vector_store_id: str) -> None:
"""Deletes the vector store used for all NIAH evaluations"""
Expand All @@ -360,3 +385,28 @@ def _delete_file(self, file_id: str) -> None:
file_id=file_id, vector_store_id=self.vector_store.id
)
self.client.files.delete(file_id=file_id)

def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float:
"""
Calculate an individual chunk's rank
When a needle is found in a certain chunk, we caclulate the rank of that chunk
This rank is based on what place in the responses it came (between 0 and total_chunks-1)
using this formula:
chunk_rank_score = (total_chunks - chunk_place) / total_chunks
e.g
total_chunks = 5
chunk_place = 0 (first in the list)
chunk_rank_score = (5 - 0) / 5 = 1.0
e.g
total_chunks = 5
chunk_place = 4 (last in 0 indexed list)
chunk_rank_score = (5 - 4) / 5 = 0.2
not finding the needle results in a score of 0 (set outside this function)
"""
chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks)
return chunk_rank_score
Loading

0 comments on commit ad697cd

Please sign in to comment.