diff --git a/Makefile b/Makefile
index 19fde54a4..ed3e2ddcc 100644
--- a/Makefile
+++ b/Makefile
@@ -28,3 +28,6 @@ run-ci: format lint type ## Running all CI checks
 run-benchmarks: ## Run benchmarks
 	@echo "Running benchmarks..."
 	@cd $(GIT_ROOT)/tests/benchmarks && python benchmark.py
+test: ## Run tests
+	@echo "Running tests..."
+	@pytest tests/unit
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 6cc1c03e8..a612ab3b2 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -26,9 +26,46 @@ def get_evaluation_mode(ds: Dataset):
 
 def evaluate(
     dataset: Dataset,
-    metrics: list[Metric],
+    metrics: list[Metric] | None = None,
 ) -> Result:
-    """ """
+    """
+    Run the evaluation on the dataset with different metrics.
+
+    Parameters
+    ----------
+    dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
+        The dataset, in the ragas format, that the metrics will use to score the
+        RAG pipeline.
+
+    metrics : list[Metric], optional
+        List of metrics to use for evaluation. If not provided, ragas will run the
+        evaluation on the best set of metrics to give a complete view.
+
+    Returns
+    -------
+    result : Result
+        Result object containing the scores of each metric. You can use this to do
+        analysis later. If the top 3 metrics are provided, it also contains the
+        `ragas_score` for the entire pipeline.
+
+    Examples
+    --------
+    The basic usage is as follows:
+    ```
+    from ragas import evaluate
+
+    >>> dataset
+    Dataset({
+        features: ['question', 'ground_truths', 'answer', 'contexts'],
+        num_rows: 30
+    })
+
+    >>> result = evaluate(dataset)
+    >>> print(result)
+    {'ragas_score': 0.860, 'context_relavency': 0.817, 'factuality': 0.892,
+    'answer_relevancy': 0.874}
+    ```
+    """
     if dataset is None:
         raise ValueError("Provide dataset!")
 
@@ -37,6 +74,11 @@ def evaluate(
 
     # TODO: check if all the metrics are compatible with the evaluation mode
 
+    if metrics is None:
+        from ragas.metrics import answer_relevancy, context_relevancy, factuality
+
+        metrics = [answer_relevancy, context_relevancy, factuality]
+
     # run the evaluation on dataset with different metrics
     # initialize all the models in the metrics
     [m.init_model() for m in metrics]
@@ -45,12 +87,14 @@ def evaluate(
     for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
-    return Result(concatenate_datasets(scores, axis=1))
+    return Result(scores=concatenate_datasets(scores, axis=1), dataset=dataset)
 
 
 @dataclass
 class Result(dict):
     scores: Dataset
+    dataset: Dataset | None = None
+    ragas_score: float | None = None
 
     def __post_init__(self):
         values = []
@@ -77,5 +121,17 @@ def describe(self):
         }
         return description
 
+    def to_pandas(self, batch_size: int | None = None, batched: bool = False):
+        if self.dataset is None:
+            raise ValueError("dataset is not provided for the results class")
+        assert self.scores.shape[0] == self.dataset.shape[0]
+        result_ds = concatenate_datasets([self.dataset, self.scores], axis=1)
+
+        return result_ds.to_pandas(batch_size=batch_size, batched=batched)
+
     def __repr__(self) -> str:
-        return super().__repr__()
+        scores = self.copy()
+        ragas_score = scores.pop("ragas_score")
+        score_strs = [f"'ragas_score': {ragas_score:0.3f}"]
+        score_strs.extend([f"'{k}': {v:0.3f}" for k, v in scores.items()])
+        return "{" + ", ".join(score_strs) + "}"
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index c1bdacadd..90f7c3930 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -14,6 +14,21 @@
 from datasets import Dataset
 
 
+def make_batches(total_size: int, batch_size: int) -> list[range]:
+    """
+    Take a total size and batch size and return a list of ranges for the batches
+    """
+    tail = total_size % batch_size
+    num_batches = floor(total_size / batch_size)
+    batches = [
+        range(i, i + batch_size) for i in range(0, batch_size * num_batches, batch_size)
+    ]
+    if tail != 0:
+        batches.append(range(batch_size * num_batches, batch_size * num_batches + tail))
+
+    return batches
+
+
 @dataclass
 class Metric(ABC):
     @property
@@ -40,18 +55,5 @@ def init_model():
     def score(self: t.Self, dataset: Dataset) -> Dataset:
         ...
 
-    def get_batches(self, dataset_size: int):
-        tail = dataset_size % self.batch_size
-        num_batches = floor(dataset_size / self.batch_size)
-        batches = [
-            range(i, i + self.batch_size)
-            for i in range(0, self.batch_size * num_batches, self.batch_size)
-        ]
-        if tail != 0:
-            batches.append(
-                range(
-                    self.batch_size * num_batches, self.batch_size * num_batches + tail
-                )
-            )
-
-        return batches
+    def get_batches(self, dataset_size: int) -> list[range]:
+        return make_batches(dataset_size, self.batch_size)
diff --git a/src/ragas/metrics/factual.py b/src/ragas/metrics/factual.py
index 71db8dcac..b1ede6935 100644
--- a/src/ragas/metrics/factual.py
+++ b/src/ragas/metrics/factual.py
@@ -62,7 +62,7 @@ class Factuality(Metric):
 
     @property
     def name(self):
-        return "NLI_score"
+        return "factuality"
 
     def init_model(self: t.Self):
         pass
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
new file mode 100644
index 000000000..056524952
--- /dev/null
+++ b/tests/unit/test_metric.py
@@ -0,0 +1,11 @@
+import pytest
+
+from ragas.metrics.base import make_batches
+
+
+@pytest.mark.parametrize(
+    "batch_size, total_size, len_expected", [(5, 10, 2), (5, 11, 3), (5, 9, 2)]
+)
+def test_make_batches(batch_size, total_size, len_expected):
+    batches = make_batches(total_size, batch_size)
+    assert len(batches) == len_expected
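
Not part of the diff: a minimal usage sketch of the behavior introduced above, i.e. calling `evaluate()` without an explicit metrics list and using the new `Result.to_pandas()` and `__repr__`. The `explodinggradients/fiqa` dataset name, the `ragas_eval` config, and the `baseline` split are illustrative assumptions, not something this patch defines; any `datasets.Dataset` with `question`, `contexts`, and `answer` columns in the ragas format works.

```
# Usage sketch only -- the dataset name, config, and split below are assumptions.
from datasets import load_dataset

from ragas import evaluate

# Any dataset with 'question', 'contexts', 'answer' columns in the ragas format.
fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")["baseline"]

# No metrics passed: evaluate() now falls back to
# [answer_relevancy, context_relevancy, factuality].
result = evaluate(fiqa_eval)

# The new __repr__ prints a dict-style summary, e.g. {'ragas_score': 0.860, ...}
print(result)

# The new to_pandas() joins the per-row scores back onto the input dataset.
df = result.to_pandas()
print(df.head())
```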
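
And for the extracted `make_batches` helper, the expected output follows directly from the implementation above: full `batch_size`-sized ranges plus a single tail range for any remainder.

```
from ragas.metrics.base import make_batches

# 10 rows with batch_size=4: two full batches plus a tail of 2.
print(make_batches(10, 4))
# [range(0, 4), range(4, 8), range(8, 10)]
```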