Skip to content

Commit

Permalink
added JSONMatch Descriptor (#1337)
Browse files Browse the repository at this point in the history
  • Loading branch information
ramachaitanya0 authored Oct 25, 2024
1 parent 35038bf commit 08d1502
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/book/reference/all-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ Check for regular expression matches.
| **ExactMatch()** <ul><li>Checks if the text matches between two columns.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExactMatch(with_column='column_2')`| **Required:** <br>`with_column` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **IsValidJSON()** <ul><li>Checks if the text in a specified column is a valid JSON.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `IsValidJSON(column_name='column_1')`| **Required:** <br>`column_name` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **JSONSchemaMatch()** <ul><li>Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact_match=True**) or minimal (**exact_match=False**) matching, with optional strict type validation (**validate_types=True**). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:** <br>`expected_schema: Dict[str, type]`<br><br>**Optional:**<ul><li>`exact_match = True` or `False`</li><li>`validate_types = True` or `False`</li></ul> |
| **JSONMatch()** <ul><li>Compares two columns of a dataframe and checks whether the two values in each row are matching JSON objects. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `JSONMatch(with_column="column_2")`| **Required:** <br> `with_column : str` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **ContainsLink()** <ul><li>Checks if the text contains at least one valid URL. </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ContainsLink(column_name='column_1')`| **Required:** <br>`column_name: str`<br><br>**Optional:**<ul><li>`display_name`</li></ul> |

## Descriptors: Text stats
Expand Down
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .hf_descriptor import HuggingFaceModel
from .hf_descriptor import HuggingFaceToxicityModel
from .is_valid_json_descriptor import IsValidJSON
from .json_match_descriptor import JSONMatch
from .json_schema_match_descriptor import JSONSchemaMatch
from .llm_judges import BiasLLMEval
from .llm_judges import ContextQualityLLMEval
Expand Down Expand Up @@ -72,4 +73,5 @@
"IsValidJSON",
"JSONSchemaMatch",
"_registry",
"JSONMatch",
]
5 changes: 5 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@
"evidently.descriptors.custom_descriptor.CustomPairColumnEval",
"evidently:descriptor:CustomPairColumnEval",
)
# JSONMatch descriptor: the alias below is the same string as Config.type_alias
# declared on the class in json_match_descriptor.py.
register_type_alias(
    FeatureDescriptor,
    "evidently.descriptors.json_match_descriptor.JSONMatch",
    "evidently:descriptor:JSONMatch",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.contains_link_descriptor.ContainsLink",
Expand Down
13 changes: 13 additions & 0 deletions src/evidently/descriptors/json_match_descriptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from evidently.features import json_match_feature
from evidently.features.generated_features import FeatureDescriptor
from evidently.features.generated_features import GeneratedFeature


class JSONMatch(FeatureDescriptor):
    """Descriptor that checks whether two text columns hold equal JSON values.

    The column the descriptor is applied to is compared, row by row, with the
    column named by ``with_column``.
    """

    class Config:
        type_alias = "evidently:descriptor:JSONMatch"

    # Name of the column to compare the described column against.
    with_column: str

    def feature(self, column_name: str) -> GeneratedFeature:
        """Build the generated feature comparing ``column_name`` with ``with_column``.

        Args:
            column_name: column the descriptor is applied to; used as the
                first column of the comparison.

        Returns:
            The ``json_match_feature.JSONMatch`` feature for the column pair.
        """
        # Forward the user-supplied display name so it is not silently dropped.
        # NOTE(review): relies on FeatureDescriptor exposing ``display_name``,
        # which the reference docs list as an optional parameter - confirm.
        return json_match_feature.JSONMatch(
            first_column=column_name,
            second_column=self.with_column,
            display_name=self.display_name,
        )
1 change: 1 addition & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
register_type_alias(
    GeneratedFeatures, "evidently.features.words_feature.WordsPresence", "evidently:feature:WordsPresence"
)
# JSONMatch feature: the alias is the same string as Config.type_alias declared
# on the class in json_match_feature.py.
register_type_alias(GeneratedFeatures, "evidently.features.json_match_feature.JSONMatch", "evidently:feature:JSONMatch")
register_type_alias(
    GeneratedFeatures, "evidently.features.contains_link_feature.ContainsLink", "evidently:feature:ContainsLink"
)
Expand Down
46 changes: 46 additions & 0 deletions src/evidently/features/json_match_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json

import pandas as pd

from evidently import ColumnType
from evidently.base_metric import ColumnName
from evidently.features.generated_features import FeatureTypeFieldMixin
from evidently.features.generated_features import GeneratedFeature
from evidently.utils.data_preprocessing import DataDefinition


class JSONMatch(FeatureTypeFieldMixin, GeneratedFeature):
    """Row-wise check that two text columns contain equal JSON values.

    Each pair of cells is parsed with ``json.loads`` and the parsed values are
    compared with ``==``, so key order and insignificant whitespace do not
    matter. A row is ``False`` when either cell cannot be parsed.
    """

    class Config:
        type_alias = "evidently:feature:JSONMatch"

    # Names of the two columns whose cells are compared.
    first_column: str
    second_column: str
    feature_type: ColumnType = ColumnType.Categorical

    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
        """Compute the match flag for every row of ``data``.

        Args:
            data: frame containing ``first_column`` and ``second_column``.
            data_definition: not used by this feature.

        Returns:
            A single-column boolean frame, indexed like ``data``, whose column
            is named by ``_feature_column_name()``. ``data`` itself is left
            unmodified.
        """

        def compare_json_objects(first_json_object: str, second_json_object: str) -> bool:
            try:
                # NOTE: Python equality is used on the parsed values, so JSON
                # ``true`` and ``1`` (and ``1`` and ``1.0``) compare as equal.
                return json.loads(first_json_object) == json.loads(second_json_object)
            except (ValueError, TypeError):
                # Invalid JSON text (ValueError) or a non-string cell such as
                # None/NaN (TypeError) counts as "no match".
                return False

        # Iterate the two columns directly instead of ``data.apply(axis=1)``:
        # this avoids writing a helper column into the caller's frame and also
        # behaves correctly for an empty frame.
        matches = [
            compare_json_objects(first_value, second_value)
            for first_value, second_value in zip(data[self.first_column], data[self.second_column])
        ]
        result = pd.Series(matches, index=data.index, dtype=bool)
        return pd.DataFrame({self._feature_column_name(): result})

    def _as_column(self) -> "ColumnName":
        """Return the column reference for the generated feature."""
        return self._create_column(
            self._feature_column_name(),
            default_display_name=f"JSON match for columns {self.first_column} and {self.second_column}",
        )

    def _feature_column_name(self):
        """Return the name of the output column produced by ``generate_feature``."""
        return f"JSON match for {self.first_column} and {self.second_column}"
57 changes: 57 additions & 0 deletions tests/features/test_json_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd

from evidently.features.json_match_feature import JSONMatch
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition


def test_is_valid_sql_feature():
    """Check JSONMatch row by row on matching, invalid and mismatching JSON pairs.

    NOTE: the function name is historical; the test exercises the JSONMatch
    feature, not SQL validation.
    """
    feature_generator = JSONMatch(first_column="col_1", second_column="col_2", display_name="Json Match")

    # Define JSON strings for each scenario
    scenarios = [
        # Scenario 1 - Matching JSONs (key order differs)
        ('{"name": "Alice", "age": 25, "city": "London"}', '{"city": "London", "age": 25, "name": "Alice"}'),
        # Scenario 2 - Different whitespace (still matching)
        ('{ "name" : "Bob" , "age" : 22 , "city" : "Paris" }', '{"city": "Paris", "name": "Bob", "age": 22}'),
        # Scenario 3 - Invalid JSON in one column (missing quotes around Eve)
        (
            '{"name": "Eve", "age": 28, "city": "Berlin"}',
            '{"city": "Berlin", "age": 28, "name": Eve}',
        ),
        # Scenario 4 - Keys mismatch ('country' vs 'city')
        (
            '{"name": "Charlie", "age": 30, "country": "USA"}',
            '{"name": "Charlie", "age": 30, "city": "USA"}',
        ),
        # Scenario 5 - Values mismatch ('David' vs 'Daniel')
        (
            '{"name": "David", "age": 35, "city": "Tokyo"}',
            '{"city": "Tokyo", "age": 35, "name": "Daniel"}',
        ),
    ]

    data = pd.DataFrame(scenarios, columns=["col_1", "col_2"])

    result = feature_generator.generate_feature(
        data=data,
        data_definition=create_data_definition(None, data, ColumnMapping()),
    )

    # The output column is named by the feature from its two input columns.
    expected_result = pd.DataFrame({"JSON match for col_1 and col_2": [True, True, False, False, False]})

    # Let the assertion propagate so pytest can actually report a failure.
    pd.testing.assert_frame_equal(result, expected_result)


if __name__ == "__main__":
    # Allow running the file directly without executing the test on import.
    test_is_valid_sql_feature()

0 comments on commit 08d1502

Please sign in to comment.