[unit test] Adding unit test for metrics.get_accuracy (openai#224)

Adding a unit test to get the ball rolling, starting with metrics since they are fundamental to evaluating performance. :) It would be great to add more tests as the project is built out, and also to enable CI (e.g., via GitHub Actions). This also fixes the `num_samples` parameter of `get_bootstrap_accuracy_std`, which was previously unused: the number of bootstrap samples was hard-coded to 1000 regardless of the argument passed.
TeneryResearch · Jun 2, 2023 · 36c2c74 · 36c2c74
1 parent cde88c0
commit 36c2c74
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 1 deletion.
diff --git a/evals/metrics.py b/evals/metrics.py
@@ -23,7 +23,7 @@ def get_accuracy(events: Sequence[Event]) -> float:
 
 def get_bootstrap_accuracy_std(events: Sequence[Event], num_samples: int = 1000):
     vals = [m.data["correct"] for m in events]
-    return np.std([np.mean(random.sample(vals, len(vals) // 2)) for _ in range(1000)])
+    return np.std([np.mean(random.sample(vals, len(vals) // 2)) for _ in range(num_samples)])
 
 
 def get_confusion_matrix(

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "pyyaml",
     "sacrebleu",
     "matplotlib",
+    "pytest",
     "setuptools_scm",
     "langchain"
 ]

diff --git a/tests/unit/evals/test_metrics.py b/tests/unit/evals/test_metrics.py
@@ -0,0 +1,24 @@
+from typing import List
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+from evals import metrics
+
+
+@pytest.mark.parametrize(
+    "event_labels, expected",
+    [
+        ([True, True], 1.0),
+        ([True, False, False], 0.333),
+        ([False, False], 0.0),
+        ([], np.nan),
+    ],
+)
+def test_get_accuracy(
+    event_labels: List[bool],
+    expected: float,
+) -> None:
+    events = [MagicMock(data={"correct": value}) for value in event_labels]
+    np.testing.assert_allclose(expected, metrics.get_accuracy(events), rtol=1e-3)