Skip to content

Commit

Permalink
Add ItemMatch + ItemNoMatch Descriptors (#1338)
Browse files Browse the repository at this point in the history
  • Loading branch information
jon-bown authored Oct 14, 2024
1 parent 1a3728d commit 634568b
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/book/reference/all-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ Check for regular expression matches.
| **DoesNotContain()** <ul><li>Checks if the text does not contain any or all specified items. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `DoesNotContain(items=["as a large language model"])` | **Required:** <br> `items: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`case_sensitive = True` or `False`</li></ul> |
| **IncludesWords()** <ul><li> Checks if the text includes **any** (default) or **all** specified words. </li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li> By default, considers inflected and variant forms of the same word. </li><li> Returns True/False for every input. </li></ul> Example use:<br> `IncludesWords(words_list=['booking', 'hotel', 'flight'])` | **Required:** <br> `words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **ExcludesWords()** <ul><li>Checks if the text excludes all specified words.</li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li>By default, considers inflected and variant forms of the same word. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExcludesWords(words_list=['buy', 'sell', 'bet'])`| **Required:** <br>`words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`lemmatize = True` or `False`</li></ul> |
| **ItemMatch()** <ul><li>Checks whether the text contains **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemMatch(with_column="expected")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`case_sensitive = True` or `False`</li></ul> |
| **ItemNoMatch()** <ul><li>Checks whether the text excludes **any** (default) or **all** specified items that are specific to each row (represented as tuples). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemNoMatch(with_column="forbidden")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'any'` or `'all'`</li><li>`case_sensitive = True` or `False`</li></ul> |

## Descriptors: Text stats

Expand Down
4 changes: 4 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from .sentiment_descriptor import Sentiment
from .text_contains_descriptor import Contains
from .text_contains_descriptor import DoesNotContain
from .text_contains_descriptor import ItemMatch
from .text_contains_descriptor import ItemNoMatch
from .text_length_descriptor import TextLength
from .text_part_descriptor import BeginsWith
from .text_part_descriptor import EndsWith
Expand Down Expand Up @@ -47,6 +49,8 @@
"EndsWith",
"DoesNotContain",
"IncludesWords",
"ItemMatch",
"ItemNoMatch",
"ExcludesWords",
"TextLength",
"TriggerWordsPresence",
Expand Down
6 changes: 6 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@
"evidently.descriptors.text_contains_descriptor.DoesNotContain",
"evidently:descriptor:DoesNotContain",
)
# Descriptor alias registrations. Each call passes the FeatureDescriptor base,
# the dotted path of a descriptor class, and that class's
# "evidently:descriptor:<Name>" alias string.
# NOTE(review): presumably this lets the alias be resolved to the class path
# without importing the module eagerly — confirm against register_type_alias.
register_type_alias(
FeatureDescriptor, "evidently.descriptors.text_contains_descriptor.ItemMatch", "evidently:descriptor:ItemMatch"
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.text_contains_descriptor.ItemNoMatch", "evidently:descriptor:ItemNoMatch"
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.text_length_descriptor.TextLength", "evidently:descriptor:TextLength"
)
Expand Down
34 changes: 34 additions & 0 deletions src/evidently/descriptors/text_contains_descriptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,37 @@ def feature(self, column_name: str) -> GeneratedFeature:
self.mode,
self.display_name,
)


class ItemMatch(FeatureDescriptor):
    """Descriptor that pairs a text column with a per-row items column.

    The actual comparison is delegated to ``text_contains_feature.ItemMatch``;
    this class only carries the configuration and builds that feature.
    """

    class Config:
        type_alias = "evidently:descriptor:ItemMatch"

    # Name of the column whose value holds the items to look for in each row.
    with_column: str
    # Forwarded to the feature unchanged; the feature accepts "any" or "all".
    mode: str = "any"
    # Forwarded to the feature unchanged.
    case_sensitive: bool = True

    def feature(self, column_name: str) -> GeneratedFeature:
        """Build the ``ItemMatch`` feature for ``column_name``.

        Args:
            column_name: the text column; it becomes the first entry of the
                feature's ``columns`` list, followed by ``with_column``.

        Returns:
            The configured ``text_contains_feature.ItemMatch`` instance.
        """
        paired_columns = [column_name, self.with_column]
        return text_contains_feature.ItemMatch(
            columns=paired_columns,
            mode=self.mode,
            case_sensitive=self.case_sensitive,
            display_name=self.display_name,
        )


class ItemNoMatch(FeatureDescriptor):
    """Descriptor that pairs a text column with a per-row column of unwanted items.

    The actual comparison is delegated to ``text_contains_feature.ItemNoMatch``;
    this class only carries the configuration and builds that feature.
    """

    class Config:
        type_alias = "evidently:descriptor:ItemNoMatch"

    # Name of the column whose value holds the items to check against in each row.
    with_column: str
    # Forwarded to the feature unchanged; the feature accepts "any" or "all".
    mode: str = "any"
    # Forwarded to the feature unchanged.
    case_sensitive: bool = True

    def feature(self, column_name: str) -> GeneratedFeature:
        """Build the ``ItemNoMatch`` feature for ``column_name``.

        Args:
            column_name: the text column; it becomes the first entry of the
                feature's ``columns`` list, followed by ``with_column``.

        Returns:
            The configured ``text_contains_feature.ItemNoMatch`` instance.
        """
        paired_columns = [column_name, self.with_column]
        return text_contains_feature.ItemNoMatch(
            columns=paired_columns,
            mode=self.mode,
            case_sensitive=self.case_sensitive,
            display_name=self.display_name,
        )
6 changes: 6 additions & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@
# Feature alias registrations. Each call passes the GeneratedFeatures base,
# the dotted path of a feature class, and that class's
# "evidently:feature:<Name>" alias string.
# NOTE(review): presumably this lets the alias be resolved to the class path
# without importing the module eagerly — confirm against register_type_alias.
register_type_alias(
GeneratedFeatures, "evidently.features.text_contains_feature.DoesNotContain", "evidently:feature:DoesNotContain"
)
register_type_alias(
GeneratedFeatures, "evidently.features.text_contains_feature.ItemMatch", "evidently:feature:ItemMatch"
)
register_type_alias(
GeneratedFeatures, "evidently.features.text_contains_feature.ItemNoMatch", "evidently:feature:ItemNoMatch"
)
register_type_alias(
GeneratedFeatures, "evidently.features.text_length_feature.TextLength", "evidently:feature:TextLength"
)
Expand Down
106 changes: 106 additions & 0 deletions src/evidently/features/text_contains_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,109 @@ def comparison(self, item: str, string: str):
if self.case_sensitive:
return item in string
return item.casefold() in string.casefold()


class ItemMatch(GeneratedFeature):
    """Row-wise check that a text column contains items listed in a second column.

    ``columns[0]`` names the text column and ``columns[1]`` names the column
    holding an iterable of items for the same row. With ``mode="any"`` a row is
    True when at least one item is a substring of the text; with ``mode="all"``
    every item must be a substring.
    """

    class Config:
        type_alias = "evidently:feature:ItemMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    columns: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Validate the arguments and store the configuration.

        Raises:
            ValueError: if ``columns`` does not have exactly two entries, or
                if ``mode`` is neither ``"any"`` nor ``"all"``.
        """
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        if mode not in ("any", "all"):
            raise ValueError("mode must be either 'any' or 'all'")
        # Fields are set before the base initialiser is called; keep that order.
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        self.mode = mode
        super().__init__()

    def _feature_column_name(self) -> str:
        """Return the generated column name, built from the columns, case flag and mode."""
        prefix = f"{self.columns[0]}_{self.columns[1]}"
        return f"{prefix}_item_match_{self.case_sensitive}_{self.mode}"

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute one boolean per row of ``data``.

        Returns:
            A single-column DataFrame named by ``_feature_column_name()``.
        """
        # Both modes differ only in how the per-item results are combined.
        combine = any if self.mode == "any" else all

        def row_matches(row):
            return combine(self.comparison(item, row[self.columns[0]]) for item in row[self.columns[1]])

        matches = data.apply(row_matches, axis=1)
        return pd.DataFrame({self._feature_column_name(): matches})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a mode-dependent default display name."""
        fallback_title = f"Text contains {self.mode} of defined items"
        return self._create_column(
            self._feature_column_name(),
            default_display_name=fallback_title,
        )

    def comparison(self, item: str, string: str):
        """Return whether ``item`` is a substring of ``string``, casefolding both if case-insensitive."""
        if not self.case_sensitive:
            item, string = item.casefold(), string.casefold()
        return item in string


class ItemNoMatch(GeneratedFeature):
    """Row-wise check that a text column does not contain items listed in a second column.

    ``columns[0]`` names the text column and ``columns[1]`` names the column
    holding an iterable of items for the same row. With ``mode="any"`` a row is
    True when none of the items is a substring of the text; with ``mode="all"``
    a row is True unless every item is a substring.
    """

    class Config:
        type_alias = "evidently:feature:ItemNoMatch"

    __feature_type__: ClassVar = ColumnType.Categorical
    columns: List[str]
    case_sensitive: bool
    mode: str

    def __init__(
        self,
        columns: List[str],
        case_sensitive: bool = True,
        mode: str = "any",
        display_name: Optional[str] = None,
    ):
        """Validate the arguments and store the configuration.

        Raises:
            ValueError: if ``columns`` does not have exactly two entries, or
                if ``mode`` is neither ``"any"`` nor ``"all"``.
        """
        # Exactly two names are needed: the text column and the items column.
        # Without this check a shorter list only fails later with IndexError,
        # and any extra entries are silently ignored.
        if len(columns) != 2:
            raise ValueError("two columns must be provided")
        self.columns = columns
        self.display_name = display_name
        self.case_sensitive = case_sensitive
        if mode not in ["any", "all"]:
            raise ValueError("mode must be either 'any' or 'all'")
        self.mode = mode
        super().__init__()

    def _feature_column_name(self) -> str:
        """Return the generated column name, built from the columns, case flag and mode."""
        return f"{self.columns[0]}_{self.columns[1]}" + "_item_no_match_" + str(self.case_sensitive) + "_" + self.mode

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute one boolean per row of ``data``.

        Returns:
            A single-column DataFrame named by ``_feature_column_name()``.
        """
        # Both modes negate the combined per-item result; only the combiner differs.
        combine = any if self.mode == "any" else all

        def row_has_no_match(row):
            return not combine(self.comparison(item, row[self.columns[0]]) for item in row[self.columns[1]])

        calculated = data.apply(row_has_no_match, axis=1)
        return pd.DataFrame({self._feature_column_name(): calculated})

    def _as_column(self) -> ColumnName:
        """Describe the generated column, with a mode-dependent default display name."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"Text does not contain {self.mode} of defined items",
        )

    def comparison(self, item: str, string: str):
        """Return whether ``item`` is a substring of ``string``, casefolding both if case-insensitive."""
        if self.case_sensitive:
            return item in string
        return item.casefold() in string.casefold()
82 changes: 82 additions & 0 deletions tests/features/test_text_contains_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from evidently.features.text_contains_feature import Contains
from evidently.features.text_contains_feature import DoesNotContain
from evidently.features.text_contains_feature import ItemMatch
from evidently.features.text_contains_feature import ItemNoMatch
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition

Expand Down Expand Up @@ -61,3 +63,83 @@ def test_text_not_contains_feature(items: List[str], case: bool, mode: str, expe
column_expected = feature_generator._feature_column_name()
expected_df = pd.DataFrame({column_expected: expected})
assert result.equals(expected_df)


@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        (True, "any", [False, True, False, True, False]),
        (True, "all", [False, True, False, False, False]),
        (False, "any", [True, True, True, True, False]),
        (False, "all", [False, True, True, False, False]),
    ],
)
def test_item_match(case: bool, mode: str, expected: List[bool]):
    """ItemMatch yields the expected per-row flags for each case/mode combination."""
    texts = [
        "You should consider purchasing Nike or Adidas shoes.",
        "I eat apples, grapes, and oranges",
        "grapes, oranges, apples.",
        "Oranges are more sour than grapes.",
        "This test doesn't have the words.",
    ]
    items_per_row = [
        ["nike", "adidas", "puma"],
        ["grapes", "apples", "oranges"],
        ["Apples", "Oranges", "Grapes"],
        ["orange", "sweet", "grape"],
        ["none", "of", "these"],
    ]
    # Items are stored as tuples, one tuple per row.
    df = pd.DataFrame({"generated": texts, "expected": [tuple(items) for items in items_per_row]})

    feature = ItemMatch(columns=["generated", "expected"], case_sensitive=case, mode=mode)
    definition = create_data_definition(None, df, ColumnMapping())
    result = feature.generate_feature(data=df, data_definition=definition)

    expected_df = pd.DataFrame({feature._feature_column_name(): expected})
    column = feature._as_column()
    assert result.equals(expected_df)
    assert column.display_name == f"Text contains {mode} of defined items"


@pytest.mark.parametrize(
    ("case", "mode", "expected"),
    [
        (True, "any", [True, False, True, False, True]),
        (True, "all", [True, False, True, True, True]),
        (False, "any", [False, False, False, False, True]),
        (False, "all", [True, False, False, True, True]),
    ],
)
def test_item_no_match(case: bool, mode: str, expected: List[bool]):
    """ItemNoMatch yields the expected per-row flags for each case/mode combination."""
    texts = [
        "You should consider purchasing Nike or Adidas shoes.",
        "I eat apples, grapes, and oranges",
        "grapes, oranges, apples.",
        "Oranges are more sour than grapes.",
        "This test doesn't have the words.",
    ]
    items_per_row = [
        ["nike", "adidas", "puma"],
        ["grapes", "apples", "oranges"],
        ["Apples", "Oranges", "Grapes"],
        ["orange", "sweet", "grape"],
        ["none", "of", "these"],
    ]
    feature = ItemNoMatch(columns=["generated", "forbidden"], case_sensitive=case, mode=mode)
    # Items are stored as tuples, one tuple per row.
    df = pd.DataFrame({"generated": texts, "forbidden": [tuple(items) for items in items_per_row]})

    definition = create_data_definition(None, df, ColumnMapping())
    result = feature.generate_feature(data=df, data_definition=definition)

    expected_df = pd.DataFrame({feature._feature_column_name(): expected})
    column = feature._as_column()
    assert result.equals(expected_df)
    assert column.display_name == f"Text does not contain {mode} of defined items"

0 comments on commit 634568b

Please sign in to comment.