Merge pull request #573 from confident-ai/hotfix/spinner
added new spinner
penguine-ip authored Mar 8, 2024
2 parents 54881c5 + 137d7ad commit 34e4688
Showing 21 changed files with 875 additions and 796 deletions.
3 changes: 1 addition & 2 deletions deepeval/__init__.py
@@ -5,7 +5,7 @@
from ._version import __version__

from deepeval.event import track
from deepeval.evaluate import evaluate, run_test, assert_test
from deepeval.evaluate import evaluate, assert_test
from deepeval.test_run import on_test_run_end, log_hyperparameters
from deepeval.utils import login_with_confident_api_key
from deepeval.telemetry import *
@@ -15,7 +15,6 @@
"log_hyperparameters",
"track",
"evaluate",
"run_test",
"assert_test",
"on_test_run_end",
]
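Since run_test is no longer exported from the package root, existing callers can switch to evaluate() (or assert_test() inside a test function). A minimal migration sketch with illustrative metric and test-case values:

    # Migration sketch (illustrative values): callers of run_test(test_case, metrics)
    # can switch to evaluate(), which also prints per-test results, or to assert_test()
    # inside a test function.
    from deepeval import evaluate, assert_test
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    test_case = LLMTestCase(
        input="What does deepeval do?",
        actual_output="deepeval is a framework for evaluating LLM outputs.",
    )
    metrics = [AnswerRelevancyMetric(threshold=0.7)]

    evaluate([test_case], metrics)      # standalone scripts
    # assert_test(test_case, metrics)   # inside a pytest/deepeval test function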
74 changes: 27 additions & 47 deletions deepeval/evaluate.py
@@ -6,13 +6,15 @@

from deepeval.utils import drop_and_copy, get_or_create_event_loop
from deepeval.telemetry import capture_evaluation_count
from deepeval.progress_context import progress_context
from deepeval.metrics import BaseMetric
from deepeval.metrics.indicator import (
measure_metrics_with_indicator,
)
from deepeval.test_case import LLMTestCase
from deepeval.tracing import get_trace_stack
from deepeval.constants import PYTEST_RUN_TEST_NAME
from deepeval.test_run import test_run_manager, APITestCase, MetricsMetadata
from deepeval.utils import get_is_running_deepeval
from deepeval.utils import get_is_running_deepeval, disable_indicator


@dataclass
@@ -127,11 +129,7 @@ async def a_execute_test_cases(
api_test_case: APITestCase = create_api_test_case(test_case, index)
test_start_time = time.perf_counter()

# Run metrics concurrently using asyncio.gather
await asyncio.gather(
*[metric.a_measure(test_case) for metric in metrics]
)

await measure_metrics_with_indicator(metrics, test_case)
for metric in metrics:
metric_metadata = MetricsMetadata(
metric=metric.__name__,
@@ -164,29 +162,6 @@ async def a_execute_test_cases(
return test_results


def run_test(
test_case: LLMTestCase,
metrics: List[BaseMetric],
) -> List[TestResult]:
# TODO: refactor
for metric in metrics:
if not isinstance(metric, BaseMetric):
raise TypeError("Provided 'metric' must be of type 'BaseMetric'.")

# TODO: refactor
if not isinstance(test_case, LLMTestCase):
raise TypeError("'test_case' must be an instance of 'LLMTestCase'.")

test_run_manager.reset()
with progress_context("Executing run_test()..."):
test_result = execute_test_cases([test_case], metrics, False)[0]
capture_evaluation_count()
print_test_result(test_result)
print("")
print("-" * 70)
return test_result


def assert_test(
test_case: LLMTestCase, metrics: List[BaseMetric], run_async: bool = True
):
@@ -218,7 +193,7 @@ def assert_test(
]
failed_metrics_str = ", ".join(
[
f"{metric.__name__} (score: {metric.score}, threshold: {metric.threshold})"
f"{metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, strict: {metric.strict_mode})"
for metric in failed_metrics
]
)
@@ -229,7 +204,11 @@ def evaluate(
test_cases: List[LLMTestCase],
metrics: List[BaseMetric],
run_async: bool = True,
show_indicator: bool = True,
):
if show_indicator is False:
disable_indicator()

# TODO: refactor
for metric in metrics:
if not isinstance(metric, BaseMetric):
@@ -243,22 +222,23 @@
)

test_run_manager.reset()
with progress_context("Evaluating testcases..."):
if run_async:
loop = get_or_create_event_loop()
test_results = loop.run_until_complete(
a_execute_test_cases(test_cases, metrics, True)
)
else:
test_results = execute_test_cases(test_cases, metrics, True)
capture_evaluation_count()
for test_result in test_results:
print_test_result(test_result)
print("")
print("-" * 70)

test_run_manager.wrap_up_test_run(display_table=False)
return test_results
print("Evaluating test cases...")
if run_async:
loop = get_or_create_event_loop()
test_results = loop.run_until_complete(
a_execute_test_cases(test_cases, metrics, True)
)
else:
test_results = execute_test_cases(test_cases, metrics, True)

capture_evaluation_count()
for test_result in test_results:
print_test_result(test_result)

print("")
print("-" * 70)
test_run_manager.wrap_up_test_run(display_table=False)
return test_results


def print_test_result(test_result: TestResult):
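A hedged usage sketch of the updated evaluate() signature shown above; the metric and test-case values are illustrative, and show_indicator=False routes through disable_indicator() to suppress the new spinner:

    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    test_case = LLMTestCase(
        input="What changed in this hotfix?",
        actual_output="The old progress bars were replaced with a new spinner.",
    )
    evaluate(
        test_cases=[test_case],
        metrics=[AnswerRelevancyMetric(threshold=0.7)],
        run_async=True,
        show_indicator=False,  # new flag added in this diff; disables the spinner
    )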
17 changes: 4 additions & 13 deletions deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -11,7 +11,7 @@
from deepeval.metrics import BaseMetric
from deepeval.models import GPTModel, DeepEvalBaseLLM
from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
from deepeval.progress_context import metrics_progress_context
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.telemetry import capture_metric_type

required_params: List[LLMTestCaseParams] = [
@@ -46,12 +46,7 @@ def __init__(

def measure(self, test_case: LLMTestCase) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
self.run_async,
):
with metric_progress_indicator(self):
if self.run_async:
loop = get_or_create_event_loop()
loop.run_until_complete(
@@ -74,12 +69,8 @@ async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
True,
_show_indicator,
with metric_progress_indicator(
self, is_async=True, _show_indicator=_show_indicator
):
self.statements: List[str] = await self._a_generate_statements(
test_case.actual_output
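For reference, a hedged sketch of calling this metric directly: after this change the spinner is rendered inside measure() by metric_progress_indicator(self) rather than configured through the old metrics_progress_context arguments. The test-case values are illustrative only:

    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase

    metric = AnswerRelevancyMetric(threshold=0.7)
    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="The capital of France is Paris.",
    )
    metric.measure(test_case)  # spinner shown via metric_progress_indicator(self)
    print(metric.score, metric.is_successful())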
19 changes: 6 additions & 13 deletions deepeval/metrics/bias/bias.py
@@ -3,7 +3,7 @@

from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.progress_context import metrics_progress_context
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.telemetry import capture_metric_type
from deepeval.models import GPTModel, DeepEvalBaseLLM
from deepeval.utils import (
@@ -47,12 +47,7 @@ def __init__(

def measure(self, test_case: LLMTestCase) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
self.run_async,
):
with metric_progress_indicator(self):
if self.run_async:
loop = get_or_create_event_loop()
loop.run_until_complete(
@@ -73,12 +68,10 @@ async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
True,
_show_indicator,
with metric_progress_indicator(
self,
is_async=True,
_show_indicator=_show_indicator,
):
self.opinions: List[str] = await self._a_generate_opinions(
test_case.actual_output
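A hedged sketch of the async path for this metric: a_measure renders its own indicator by default (is_async=True inside metric_progress_indicator), and _show_indicator=False suppresses it when a caller such as measure_metrics_with_indicator displays an aggregate indicator instead. Values below are illustrative:

    import asyncio

    from deepeval.metrics import BiasMetric
    from deepeval.test_case import LLMTestCase

    async def main():
        metric = BiasMetric(threshold=0.5)
        test_case = LLMTestCase(
            input="Describe the applicant.",
            actual_output="The applicant has five years of relevant experience.",
        )
        # Indicator shown by default; pass _show_indicator=False to suppress it.
        await metric.a_measure(test_case)
        print(metric.score, metric.is_successful())

    asyncio.run(main())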
19 changes: 6 additions & 13 deletions deepeval/metrics/contextual_precision/contextual_precision.py
@@ -12,7 +12,7 @@
from deepeval.metrics.contextual_precision.template import (
ContextualPrecisionTemplate,
)
from deepeval.progress_context import metrics_progress_context
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.telemetry import capture_metric_type


@@ -50,12 +50,7 @@ def __init__(

def measure(self, test_case: LLMTestCase) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
self.run_async,
):
with metric_progress_indicator(self):

if self.run_async:
loop = get_or_create_event_loop()
@@ -80,12 +75,10 @@ async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
True,
_show_indicator,
with metric_progress_indicator(
self,
is_async=True,
_show_indicator=_show_indicator,
):
self.verdicts: List[ContextualPrecisionVerdict] = (
await self._a_generate_verdicts(
19 changes: 6 additions & 13 deletions deepeval/metrics/contextual_recall/contextual_recall.py
@@ -10,7 +10,7 @@
from deepeval.metrics import BaseMetric
from deepeval.models import GPTModel, DeepEvalBaseLLM
from deepeval.metrics.contextual_recall.template import ContextualRecallTemplate
from deepeval.progress_context import metrics_progress_context
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.telemetry import capture_metric_type

required_params: List[LLMTestCaseParams] = [
@@ -47,12 +47,7 @@ def __init__(

def measure(self, test_case: LLMTestCase) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
self.run_async,
):
with metric_progress_indicator(self):
if self.run_async:
loop = get_or_create_event_loop()
loop.run_until_complete(
@@ -74,12 +69,10 @@ async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
True,
_show_indicator,
with metric_progress_indicator(
self,
is_async=True,
_show_indicator=_show_indicator,
):
self.verdicts: List[ContextualRecallVerdict] = (
await self._a_generate_verdicts(
19 changes: 6 additions & 13 deletions deepeval/metrics/contextual_relevancy/contextual_relevancy.py
@@ -13,7 +13,7 @@
from deepeval.metrics.contextual_relevancy.template import (
ContextualRelevancyTemplate,
)
from deepeval.progress_context import metrics_progress_context
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.telemetry import capture_metric_type

required_params: List[LLMTestCaseParams] = [
@@ -49,12 +49,7 @@ def __init__(

def measure(self, test_case: LLMTestCase) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
self.run_async,
):
with metric_progress_indicator(self):
if self.run_async:
loop = get_or_create_event_loop()
loop.run_until_complete(
@@ -76,12 +71,10 @@ async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
True,
_show_indicator,
with metric_progress_indicator(
self,
is_async=True,
_show_indicator=_show_indicator,
):
self.verdicts: List[ContextualRelevancyVerdict] = (
await self._a_generate_verdicts(
4 changes: 3 additions & 1 deletion deepeval/metrics/cost.py
@@ -15,7 +15,9 @@ def measure(self, test_case: LLMTestCase):
capture_metric_type(self.__name__)
return self.score

async def a_measure(self, test_case: LLMTestCase):
async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
):
self.success = test_case.cost <= self.threshold
self.score = test_case.cost
capture_metric_type(self.__name__)
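The widened a_measure signature here is for uniformity: an async runner can pass the same keyword to every metric, and non-LLM metrics such as this cost metric simply accept and ignore it. A minimal sketch, with metric instances passed in rather than imported since class names fall outside this hunk:

    import asyncio

    async def run_metrics(metrics, test_case):
        # Works for LLM-based and non-LLM metrics alike after this change.
        await asyncio.gather(
            *[m.a_measure(test_case, _show_indicator=False) for m in metrics]
        )
        return {m.__name__: m.score for m in metrics}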
17 changes: 4 additions & 13 deletions deepeval/metrics/faithfulness/faithfulness.py
@@ -11,7 +11,7 @@
)
from deepeval.models import GPTModel, DeepEvalBaseLLM
from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
from deepeval.progress_context import metrics_progress_context
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.telemetry import capture_metric_type

required_params: List[LLMTestCaseParams] = [
@@ -47,12 +47,7 @@ def __init__(

def measure(self, test_case: LLMTestCase) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
self.run_async,
):
with metric_progress_indicator(self):
if self.run_async:
loop = get_or_create_event_loop()
loop.run_until_complete(
@@ -72,12 +67,8 @@ async def a_measure(
self, test_case: LLMTestCase, _show_indicator: bool = True
) -> float:
check_test_case_params(test_case, required_params, self.__name__)
with metrics_progress_context(
self.__name__,
self.evaluation_model,
self.strict_mode,
True,
_show_indicator,
with metric_progress_indicator(
self, is_async=True, _show_indicator=_show_indicator
):
self.truths, self.claims = await asyncio.gather(
self._a_generate_truths(test_case.retrieval_context),
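A generic illustration (not deepeval code) of the asyncio.gather pattern the hunk above uses to generate truths and claims concurrently from the retrieval context and the actual output:

    import asyncio

    async def fake_generate(label: str) -> list[str]:
        await asyncio.sleep(0.1)  # stand-in for an LLM call
        return [f"{label} 1", f"{label} 2"]

    async def main():
        truths, claims = await asyncio.gather(
            fake_generate("truth"), fake_generate("claim")
        )
        print(truths, claims)

    asyncio.run(main())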