Skip to content

Commit

Permalink
Add metrics InRange, OutRange, InList, OutList and fix some bugs.
Browse files Browse the repository at this point in the history
  • Loading branch information
Liraim committed Dec 27, 2024
1 parent 4ce3ea8 commit 1368ba1
Show file tree
Hide file tree
Showing 6 changed files with 1,029 additions and 93 deletions.
824 changes: 778 additions & 46 deletions examples/list_metrics.ipynb

Large diffs are not rendered by default.

87 changes: 46 additions & 41 deletions examples/metric_workbench.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/evidently/v2/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,11 @@ def from_pandas(
new_column = descriptor.generate_data(dataset)
if isinstance(new_column, DatasetColumn):
data[key] = new_column.data
dataset._data_definition.columns[key] = ColumnInfo(new_column.type)
elif len(new_column) > 1:
for col, value in new_column.items():
data[f"{key}.{col}"] = value.data
dataset._data_definition.columns[f"{key}.{col}"] = ColumnInfo(value.type)
else:
data[key] = list(new_column.values())[0].data
return dataset
Expand Down
11 changes: 10 additions & 1 deletion src/evidently/v2/descriptors/_text_length.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from typing import Any
from typing import Dict
from typing import Optional
from typing import Union

import numpy as np

from evidently import ColumnType
from evidently.v2.datasets import Dataset
from evidently.v2.datasets import DatasetColumn
Expand All @@ -14,5 +17,11 @@ def __init__(self, column_name: str, alias: Optional[str] = None):
self._column_name: str = column_name

def generate_data(self, dataset: "Dataset") -> Union[DatasetColumn, Dict[str, DatasetColumn]]:
    """Build a numerical column holding the length of each item in the source column.

    Args:
        dataset: dataset whose dataframe contains the column named by
            ``self._column_name``.

    Returns:
        A ``DatasetColumn`` of type ``ColumnType.Numerical`` with one length
        per row.
    """
    # Only the null-safe helper is applied. A plain ``.apply(len)`` pass would
    # raise TypeError on None/NaN cells before the safe path could run.
    column_items_lengths = dataset.as_dataframe()[self._column_name].apply(_apply)
    return DatasetColumn(type=ColumnType.Numerical, data=column_items_lengths)


def _apply(value: Any):
if value is None or (isinstance(value, float) and np.isnan(value)):
return 0
return len(value)
55 changes: 50 additions & 5 deletions src/evidently/v2/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,18 @@ def get_label_result(self, label: Label) -> SingleValue:
return SingleValue(value)


@dataclasses.dataclass
class CountValue(MetricResult):
    """Metric result made of an absolute count and its share."""

    count: int
    share: float

    def get_count(self) -> SingleValue:
        """Return the count wrapped in a ``SingleValue``."""
        count_result = SingleValue(self.count)
        return count_result

    def get_share(self) -> SingleValue:
        """Return the share wrapped in a ``SingleValue``."""
        share_result = SingleValue(self.share)
        return share_result


class MetricTest(Protocol[TResult]):
    """Callable protocol for a test that takes a metric and a result of type ``TResult``."""

    def __call__(self, metric: "Metric", value: TResult) -> MetricTestResult:
        """Run the test for ``metric`` against ``value`` and return its result."""
        ...

Expand All @@ -137,10 +149,6 @@ class SingleValueMetricTest(MetricTest[SingleValue], Protocol):
def __call__(self, metric: "Metric", value: SingleValue) -> MetricTestResult: ...


class ByLabelValueMetricTest(MetricTest[ByLabelValue], Protocol):
    """Callable protocol for a test that takes a metric and a ``ByLabelValue`` result."""

    def __call__(self, metric: "Metric", value: ByLabelValue) -> MetricTestResult:
        """Run the test for ``metric`` against the by-label ``value``."""
        ...


# Alias: a metric identifier is a plain string.
MetricId = str

# Alias: mapping from a label to a list of single-value metric tests.
ByLabelValueTests = typing.Dict[Label, List[SingleValueMetricTest]]
Expand Down Expand Up @@ -178,6 +186,19 @@ def get_default_render(title: str, result: TResult) -> List[BaseWidgetInfo]:
return [
table_data(title=title, column_names=["Label", "Value"], data=[(k, v) for k, v in result.values.items()])
]
if isinstance(result, CountValue):
return [
counter(
title=f"{title}: count",
size=WidgetSize.HALF,
counters=[CounterData(label="", value=str(result.count))],
),
counter(
title=f"{title}: share",
size=WidgetSize.HALF,
counters=[CounterData(label="", value=f"{result.share:.2f}")],
),
]
raise NotImplementedError(f"No default render for {type(result)}")


Expand Down Expand Up @@ -292,7 +313,7 @@ class ByLabelMetric(Metric[ByLabelValue], ABC):
_tests: Optional[typing.Dict[Label, List[SingleValueMetricTest]]]

def label_metric(self, label: Label) -> SingleValueMetric:
    """Return the single-value metric for ``label``.

    This implementation provides no such metric and always raises.

    Raises:
        NotImplementedError: always.
    """
    # No bare ``return`` before the raise: it made the raise unreachable and
    # let the method silently return None despite the declared return type.
    raise NotImplementedError()

def with_tests(self, tests: Optional[typing.Dict[Label, List[SingleValueMetricTest]]]):
self._tests = tests
Expand All @@ -305,3 +326,27 @@ def get_tests(self, value: ByLabelValue) -> Generator[MetricTestResult, None, No
label_value = value.get_label_result(label)
for test in tests:
yield test(self, label_value)


class CountMetric(Metric[CountValue], ABC):
    """Base class for metrics that produce a ``CountValue`` (count plus share).

    Tests are kept in two separate lists: one run against the count and one
    run against the share.
    """

    _count_tests: Optional[List[SingleValueMetricTest]] = None
    _share_tests: Optional[List[SingleValueMetricTest]] = None

    def with_tests(
        self,
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Store the count and share test lists and return ``self`` for chaining."""
        self._count_tests, self._share_tests = count_tests, share_tests
        return self

    def get_tests(self, value: CountValue) -> Generator[MetricTestResult, None, None]:
        """Yield the results of the count tests, then of the share tests.

        Nothing is yielded when neither test list has been set.
        """
        nothing_configured = self._count_tests is None and self._share_tests is None
        if nothing_configured:
            return
        count_value = value.get_count()
        yield from (test(self, count_value) for test in self._count_tests or [])
        share_value = value.get_share()
        yield from (test(self, share_value) for test in self._share_tests or [])
143 changes: 143 additions & 0 deletions src/evidently/v2/metrics/column_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Union

from evidently.metric_results import HistogramData
from evidently.metric_results import Label
from evidently.model.widget import BaseWidgetInfo
from evidently.options import ColorOptions
from evidently.renderers.html_widgets import WidgetSize
Expand All @@ -14,6 +15,8 @@
from evidently.v2.datasets import DatasetColumn
from evidently.v2.metrics import SingleValue
from evidently.v2.metrics import SingleValueMetricTest
from evidently.v2.metrics.base import CountMetric
from evidently.v2.metrics.base import CountValue
from evidently.v2.metrics.base import SingleValueMetric


Expand Down Expand Up @@ -141,3 +144,143 @@ def calculate_value(self, column: DatasetColumn) -> Union[float, int]:

def display_name(self) -> str:
    """Return the human-readable name of this quantile metric."""
    quantile, column = self._quantile, self._column
    return f"Quantile {quantile} of {column}"


class CategoryCount(CountMetric):
    """Count and share of values in a column that equal a given category.

    The share is computed against the number of non-null values in the column.
    """

    def __init__(
        self,
        column: str,
        category: Label,
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Configure the metric.

        Args:
            column: name of the column to inspect.
            category: value to count.
            count_tests: optional tests run against the count.
            share_tests: optional tests run against the share.
        """
        super().__init__(f"category_count:{column}:{category}")
        self._column = column
        self._category = category
        self.with_tests(count_tests, share_tests)

    def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> CountValue:
        """Compute the count and share of the category in ``current_data``.

        ``reference_data`` is not used. A category that never occurs yields a
        count of 0, and a column with no non-null values yields a share of 0.0.
        """
        # NOTE(review): column.data is presumably a pandas Series -- confirm.
        column = current_data.column(self._column)
        # Element-wise comparison gives 0 for an absent category, whereas
        # indexing value_counts() with it raised KeyError.
        count = int((column.data == self._category).sum())
        total = int(column.data.count())
        share = count / total if total else 0.0
        return CountValue(count, share)

    def display_name(self) -> str:
        """Return the human-readable name of this metric."""
        return f"Column '{self._column}' category '{self._category}'"


class InRangeValueCount(CountMetric):
    """Count and share of values in a column that fall inside ``[left, right]``.

    The share is computed against the number of non-null values in the column.
    """

    def __init__(
        self,
        column: str,
        left: Union[int, float],
        right: Union[int, float],
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Configure the metric.

        Args:
            column: name of the column to inspect.
            left: lower bound of the range.
            right: upper bound of the range.
            count_tests: optional tests run against the count.
            share_tests: optional tests run against the share.
        """
        super().__init__(f"in_range:{column}:{left}:{right}")
        self._column = column
        self._left = left
        self._right = right
        self.with_tests(count_tests, share_tests)

    def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> CountValue:
        """Compute the count and share of in-range values in ``current_data``.

        ``reference_data`` is not used. A column with no non-null values
        yields a share of 0.0.
        """
        # NOTE(review): column.data is presumably a pandas Series -- confirm.
        column = current_data.column(self._column)
        # between() returns a boolean mask. The True entries must be summed;
        # count() on the mask would return the number of rows, not matches.
        in_range = int(column.data.between(self._left, self._right).sum())
        total = int(column.data.count())
        share = in_range / total if total else 0.0
        return CountValue(in_range, share)

    def display_name(self) -> str:
        """Return the human-readable name of this metric."""
        return f"Column '{self._column}' values in range {self._left} to {self._right}"


class OutRangeValueCount(CountMetric):
    """Count and share of non-null values in a column that fall outside ``[left, right]``.

    The share is computed against the number of non-null values in the column.
    """

    def __init__(
        self,
        column: str,
        left: Union[int, float],
        right: Union[int, float],
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Configure the metric.

        Args:
            column: name of the column to inspect.
            left: lower bound of the range.
            right: upper bound of the range.
            count_tests: optional tests run against the count.
            share_tests: optional tests run against the share.
        """
        super().__init__(f"out_range:{column}:{left}:{right}")
        self._column = column
        self._left = left
        self._right = right
        self.with_tests(count_tests, share_tests)

    def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> CountValue:
        """Compute the count and share of out-of-range values in ``current_data``.

        ``reference_data`` is not used. A column with no non-null values
        yields a share of 0.0.
        """
        # NOTE(review): column.data is presumably a pandas Series -- confirm.
        column = current_data.column(self._column)
        # Sum the boolean mask; count() on it would return the number of rows.
        in_range = int(column.data.between(self._left, self._right).sum())
        total = int(column.data.count())
        out_of_range = total - in_range
        # The share must describe the same population as the count
        # (out-of-range values), not the in-range ones.
        share = out_of_range / total if total else 0.0
        return CountValue(out_of_range, share)

    def display_name(self) -> str:
        """Return the human-readable name of this metric."""
        return f"Column '{self._column}' values out of range {self._left} to {self._right}"


class InListValueCount(CountMetric):
    """Count and share of non-null values in a column that belong to a given list.

    The share is computed against the number of non-null values in the column.
    """

    def __init__(
        self,
        column: str,
        values: List[Label],
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Configure the metric.

        Args:
            column: name of the column to inspect.
            values: values that count as "in list".
            count_tests: optional tests run against the count.
            share_tests: optional tests run against the share.
        """
        # str() each label so non-string labels do not break the join.
        super().__init__(f"in_list:{column}:{'|'.join(map(str, values))}")
        self._column = column
        self._values = values
        self.with_tests(count_tests, share_tests)

    def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> CountValue:
        """Compute the count and share of listed values in ``current_data``.

        ``reference_data`` is not used. Listed values that never occur add 0
        to the count, and a column with no non-null values yields a share of 0.0.
        """
        # NOTE(review): column.data is presumably a pandas Series -- confirm.
        column = current_data.column(self._column)
        non_null = column.data.dropna()
        # isin() tolerates listed values that are absent from the data, whereas
        # indexing value_counts() with them raised KeyError.
        in_list = int(non_null.isin(self._values).sum())
        total = len(non_null)
        share = in_list / total if total else 0.0
        return CountValue(in_list, share)

    def display_name(self) -> str:
        """Return the human-readable name of this metric."""
        return f"Column '{self._column}' values in list [{', '.join(map(str, self._values))}]"


class OutListValueCount(CountMetric):
    """Count and share of non-null values in a column that are not in a given list.

    The share is computed against the number of non-null values in the column.
    """

    def __init__(
        self,
        column: str,
        values: List[Label],
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Configure the metric.

        Args:
            column: name of the column to inspect.
            values: values that count as "in list"; everything else is counted.
            count_tests: optional tests run against the count.
            share_tests: optional tests run against the share.
        """
        # str() each label so non-string labels do not break the join.
        super().__init__(f"out_list:{column}:{'|'.join(map(str, values))}")
        self._column = column
        self._values = values
        self.with_tests(count_tests, share_tests)

    def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> CountValue:
        """Compute the count and share of unlisted values in ``current_data``.

        ``reference_data`` is not used. A column with no non-null values
        yields a share of 0.0.
        """
        # NOTE(review): column.data is presumably a pandas Series -- confirm.
        column = current_data.column(self._column)
        non_null = column.data.dropna()
        # isin() tolerates listed values that are absent from the data, whereas
        # indexing value_counts() with them raised KeyError.
        in_list = int(non_null.isin(self._values).sum())
        total = len(non_null)
        out_of_list = total - in_list
        # The share must describe the same population as the count
        # (unlisted values), not the listed ones.
        share = out_of_list / total if total else 0.0
        return CountValue(out_of_list, share)

    def display_name(self) -> str:
        """Return the human-readable name of this metric."""
        return f"Column '{self._column}' values out of list [{', '.join(map(str, self._values))}]"


class MissingValueCount(CountMetric):
    """Count and share of missing (null) values in a column.

    The share is computed against the total number of rows in the column.
    """

    def __init__(
        self,
        column: str,
        count_tests: Optional[List[SingleValueMetricTest]] = None,
        share_tests: Optional[List[SingleValueMetricTest]] = None,
    ):
        """Configure the metric.

        Args:
            column: name of the column to inspect.
            count_tests: optional tests run against the count.
            share_tests: optional tests run against the share.
        """
        super().__init__(f"missing_values:{column}")
        self._column = column
        self.with_tests(count_tests, share_tests)

    def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> CountValue:
        """Compute the count and share of missing values in ``current_data``.

        ``reference_data`` is not used. An empty column yields a share of 0.0.
        """
        # NOTE(review): column.data is presumably a pandas Series -- confirm.
        column = current_data.column(self._column)
        total = len(column.data)
        present = int(column.data.count())
        missing = total - present
        # The share must describe the missing values, matching the count;
        # dividing the non-missing count by total reported the opposite.
        share = missing / total if total else 0.0
        return CountValue(missing, share)

    def display_name(self) -> str:
        """Return the human-readable name of this metric."""
        return f"Column '{self._column}' missing values"

0 comments on commit 1368ba1

Please sign in to comment.