Metrics redesign #1389

Draft pull request: wants to merge 38 commits into main from feature/metrics-redesign.
Changes shown are from 8 of the 38 commits.

Commits (38):
07d3a19  First draft of redesign. (Liraim, Dec 3, 2024)
368724f  Add examples (Liraim, Dec 3, 2024)
cf38a34  Add examples (Liraim, Dec 3, 2024)
371aa0d  Add FeatureScorers. (Liraim, Dec 6, 2024)
4953ef5  Add metric container, group by and support for llm_judge in scorers. (Liraim, Dec 9, 2024)
6ccffa6  Fix example. (Liraim, Dec 9, 2024)
7644cb1  Fix example. (Liraim, Dec 9, 2024)
ede9486  Add custom scorers. (Liraim, Dec 9, 2024)
c194b4b  Add first version of presets and report. (Liraim, Dec 11, 2024)
c350d95  Fix column name for FeatureScorer. (Liraim, Dec 12, 2024)
92bd2f5  Add implementations for column_max, column_mean, column_min and colum… (Liraim, Dec 12, 2024)
e142196  Support MetricPreset and MetricContainer in Reports. (Liraim, Dec 13, 2024)
ccab7fc  codegen scorers (#1394) (mike0sv, Dec 13, 2024)
e35ef8e  Add separate render for metrics and checks. (Liraim, Dec 16, 2024)
d451be5  Merge branch 'refs/heads/main' into feature/metrics-redesign (Liraim, Dec 17, 2024)
6a52347  rename scoreres->descriptors, add showcase.ipynb and add sql descriptor (mike0sv, Dec 17, 2024)
a61f7af  Collect checks in single list. (Liraim, Dec 18, 2024)
cca2ad3  Add example F1 metric (from legacy metrics) in new reports. (Liraim, Dec 20, 2024)
863f12a  Add metrics ROC AUC, f1, precision, recall. (Liraim, Dec 22, 2024)
e8556db  Add same render as in old metrics to RocAucMetric. (Liraim, Dec 23, 2024)
51af315  Rename checks to tests and reorganize code. (Liraim, Dec 23, 2024)
a4903b1  Bring back column_summary metrics. (Liraim, Dec 23, 2024)
5b765b0  Add quality by class preset. (Liraim, Dec 24, 2024)
cb3caa8  Remove tabs title and rename Checks to Tests. (Liraim, Dec 24, 2024)
f894998  support v2 metrics in v1 snapshots (#1398) (mike0sv, Dec 24, 2024)
81029c4  add mylabelvalue (mike0sv, Dec 25, 2024)
2ce0121  fix ipython import (mike0sv, Dec 25, 2024)
c831a51  fix (mike0sv, Dec 25, 2024)
a80e253  fix deduplication (mike0sv, Dec 25, 2024)
30e1e79  Add support for ByLabel tests. (Liraim, Dec 25, 2024)
7fca28e  Update column statistics metrics: min, max, mean, median, quantile and… (Liraim, Dec 25, 2024)
71ebb54  Fix group_by metrics and metric_workbench examples. (Liraim, Dec 25, 2024)
b9e04fa  make all reports (mike0sv, Dec 26, 2024)
79341e9  Add distribution widgets to metrics. (Liraim, Dec 27, 2024)
d5732b3  Merge remote-tracking branch 'origin/feature/metrics-redesign' into f… (mike0sv, Dec 27, 2024)
4ce3ea8  test widgets (mike0sv, Dec 27, 2024)
1368ba1  Add metrics InRange, OutRange, InList, OutList and fix some bugs. (Liraim, Dec 27, 2024)
5c123bc  add count value support (mike0sv, Dec 27, 2024)
258 changes: 258 additions & 0 deletions examples/metric_workbench.ipynb
@@ -0,0 +1,258 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T18:07:57.136304Z",
"start_time": "2024-12-09T18:07:57.132965Z"
}
},
"outputs": [],
"source": [
"from typing import Dict\n",
"from typing import Union\n",
"\n",
"import pandas as pd\n",
"\n",
"from evidently import ColumnType\n",
"\n",
"from evidently.v2.datasets import DatasetColumn\n",
"\n",
"from evidently.v2.datasets import Scorer\n",
"from evidently.v2.metrics import Metric\n",
"from evidently.v2.datasets import Dataset\n",
"from evidently.v2.metrics import SingleValue\n",
"from evidently.v2.metrics import SingleValueCheck"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de4b011f992f1165",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T18:07:59.040196Z",
"start_time": "2024-12-09T18:07:59.036939Z"
}
},
"outputs": [],
"source": [
"from typing import Optional\n",
"\n",
"\n",
"class TextLengthScorer(Scorer):\n",
" def __init__(self, column_name: str, alias: Optional[str] = None):\n",
" super().__init__(alias or f\"{column_name}: Text Length\")\n",
" self._column_name = column_name\n",
"\n",
" def generate_data(self, dataset: \"Dataset\") -> Union[DatasetColumn, Dict[str, DatasetColumn]]:\n",
" lengths = dataset.column(self._column_name).data.apply(len)\n",
" return DatasetColumn(type=ColumnType.Numerical, data=lengths)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc99c9bcf41dd669",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T18:20:21.744281Z",
"start_time": "2024-12-09T18:20:21.738294Z"
}
},
"outputs": [],
"source": [
"class ToxicityScorer(Scorer):\n",
" def __init__(self, column_name: str, alias: Optional[str] = None):\n",
" super().__init__(alias or f\"{column_name}: Toxicity\")\n",
" self._column_name = column_name\n",
" \n",
" def generate_data(self, dataset: \"Dataset\") -> Union[DatasetColumn, Dict[str, DatasetColumn]]:\n",
" from evidently.descriptors import ToxicityLLMEval\n",
" from evidently.options.base import Options\n",
"\n",
" feature = ToxicityLLMEval().feature(self._column_name)\n",
" data = feature.generate_features(dataset.as_dataframe(), None, Options())\n",
" return {\n",
" col: DatasetColumn(type=feature.get_type(f\"{feature.get_fingerprint()}.{col}\"), data=data[col])\n",
" for col in data.columns\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48fc3742208e6385",
"metadata": {},
"outputs": [],
"source": [
"def my_scorer(data: DatasetColumn) -> DatasetColumn:\n",
" return DatasetColumn(type=ColumnType.Numerical, data=data.data)\n",
"\n",
"def my_scorer2(dataset: Dataset) -> Union[DatasetColumn, Dict[str, DatasetColumn]]:\n",
" return dataset.column(\"column_1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "279361d330bca4d8",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T18:20:46.958375Z",
"start_time": "2024-12-09T18:20:42.774548Z"
}
},
"outputs": [],
"source": [
"from evidently.v2.scorers import CustomColumnScorer\n",
"from evidently.v2.scorers import CustomScorer\n",
"from evidently.v2.scorers import TextLength\n",
"\n",
"data = pd.DataFrame(data={\"column_1\": [1, 2, 3, 4, -1, 5], \"column_2\": [\"a\", \"aa\", \"aaaa\", \"aaaaaaa\", \"a\", \"aa\"]})\n",
"\n",
"dataset = Dataset.from_pandas(\n",
" data,\n",
" data_definition=None,\n",
" scorers=[\n",
" TextLength(\"column_2\", alias=\"column 2 length\"),\n",
" ToxicityScorer(\"column_2\"),\n",
" CustomColumnScorer(\"column_2\", my_scorer, alias=\"column 2 custom function\"),\n",
" CustomScorer(my_scorer2, alias=\"global custom function\"),\n",
" ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd5aed998289f2f3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T18:20:48.836818Z",
"start_time": "2024-12-09T18:20:48.830392Z"
}
},
"outputs": [],
"source": [
"dataset.as_dataframe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f49a28098d1cad0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T17:44:07.086154Z",
"start_time": "2024-12-09T17:44:07.078132Z"
}
},
"outputs": [],
"source": [
"from evidently.descriptors import TextLength\n",
"\n",
"from evidently.v2.checks.numerical_checks import le, ge\n",
"from typing import Optional\n",
"from typing import List\n",
"from plotly.express import line\n",
"\n",
"# user-friendly interface\n",
"def max_metric(column_name: str, checks: Optional[List[SingleValueCheck]] = None, group_by: Optional[str] = None) -> Metric:\n",
" return MaxMetric(column_name, checks)\n",
"\n",
"\n",
"# implementation\n",
"class MaxMetric(Metric[SingleValue]):\n",
" _column_name: str\n",
" \n",
" def __init__(self, column_name: str, checks: Optional[List[SingleValueCheck]] = None):\n",
" super().__init__(f\"max:{column_name}\")\n",
" self._column_name = column_name\n",
" self._checks = checks if checks is not None else [le(10), ge(6)]\n",
"\n",
" def calculate(self, current_data: Dataset, reference_data: Optional[Dataset]) -> SingleValue:\n",
" x = current_data.column(self._column_name).data\n",
" value = x.max()\n",
" result = SingleValue(value=value)\n",
" figure = line(x)\n",
" figure.add_hrect(6, 10)\n",
" #result.set_widget([plotly_figure(title=self.display_name(), figure=figure)])\n",
" return result\n",
"\n",
" def display_name(self) -> str:\n",
" return f\"Max value for {self._column_name}\"\n",
"\n",
"\n",
"result = max_metric(\"column_1\", checks=None).call(dataset, None)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9566651ec9d1bee1",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T17:45:16.507805Z",
"start_time": "2024-12-09T17:45:16.316855Z"
}
},
"outputs": [],
"source": [
"from evidently.v2.metrics.group_by import GroupBy\n",
"from evidently.v2.metrics.base import render_results\n",
"from evidently.v2.report import Context\n",
"\n",
"context = Context()\n",
"\n",
"context.init_dataset(dataset, None)\n",
"\n",
"metrics = GroupBy(max_metric(\"column 2 length\"), \"column_1\").generate_metrics(context)\n",
"\n",
"results = [metric.call(dataset, None) for metric in metrics]\n",
"\n",
"render_results(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8e8eaadc002b5d1",
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-09T17:44:36.385681Z",
"start_time": "2024-12-09T17:44:36.376478Z"
}
},
"outputs": [],
"source": [
"results[0].value"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
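
Taken together, the notebook exercises the new v2 flow: attach scorers when building a Dataset, then run metrics with checks against it. Below is a minimal sketch of that flow, assuming the draft interfaces on this branch (Dataset.from_pandas, the TextLength scorer, and the le/ge checks added further down) and reusing the max_metric helper defined in the notebook above; names and signatures may still change as the redesign evolves.

import pandas as pd

from evidently.v2.checks.numerical_checks import ge, le
from evidently.v2.datasets import Dataset
from evidently.v2.scorers import TextLength

# Build a dataset and attach a scorer that derives a numeric "answer length" column.
data = pd.DataFrame({"answer": ["short", "a somewhat longer answer", "mid"]})
dataset = Dataset.from_pandas(
    data,
    data_definition=None,
    scorers=[TextLength("answer", alias="answer length")],
)

# Run a metric over the derived column with explicit pass/fail checks,
# using the max_metric helper defined in the notebook above.
result = max_metric("answer length", checks=[le(30), ge(1)]).call(dataset, None)
result.value

The resulting metric objects can also be passed to render_results, as the GroupBy cell in the notebook does.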
Empty file added src/evidently/v2/__init__.py
28 changes: 28 additions & 0 deletions src/evidently/v2/checks/numerical_checks.py
@@ -0,0 +1,28 @@
from typing import Union

from ..metrics.base import CheckResult
from ..metrics.base import SingleValue
from ..metrics.base import SingleValueCheck
from ..metrics.base import TestStatus


def le(threshold: Union[int, float]) -> SingleValueCheck:
def func(value: SingleValue):
return CheckResult(
f"Less or Equal {threshold}",
"",
TestStatus.SUCCESS if value.value <= threshold else TestStatus.FAIL,
)

return func


def ge(threshold: Union[int, float]) -> SingleValueCheck:
def func(value: SingleValue):
return CheckResult(
f"Greater or Equal {threshold}",
"",
TestStatus.SUCCESS if value.value >= threshold else TestStatus.FAIL,
)

return func
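
For illustration, these factories can also be evaluated directly on a SingleValue, outside of a metric. A short sketch, assuming SingleValue can be constructed with a keyword value as the notebook's MaxMetric does:

from evidently.v2.checks.numerical_checks import ge, le
from evidently.v2.metrics import SingleValue

# le(10) returns a SingleValueCheck; 7 <= 10, so the CheckResult carries TestStatus.SUCCESS.
passed = le(10)(SingleValue(value=7))

# 3 < 6, so ge(6) produces a CheckResult with TestStatus.FAIL.
failed = ge(6)(SingleValue(value=3))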