added JSONMatch Descriptor (#1337)

evidentlyai · Oct 25, 2024 · 08d1502 · 08d1502
1 parent 35038bf
commit 08d1502
Show file tree

Hide file tree

Showing 7 changed files with 125 additions and 0 deletions.
diff --git a/docs/book/reference/all-metrics.md b/docs/book/reference/all-metrics.md
@@ -279,6 +279,7 @@ Check for regular expression matches.
 | **ExactMatch()** <ul><li>Checks if the text matches between two columns.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExactMatch(column_name='column_1')`| **Required:** <br>`with_column` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
 | **IsValidJSON()** <ul><li>Checks if the text in a specified column is a valid JSON.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `IsValidJSON(column_name='column_1')`| **Required:** <br>`column_name` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
 | **JSONSchemaMatch()** <ul><li>Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact=True**) or minimal (**exact=False**) matching, with optional strict type validation (**validate_types=True**).  </li><li>Returns True/False for each row. </li></ul> Example use:<br> `JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:** <br>`expected_schema: Dict[str, type]`<br><br>**Optional:**<ul><li>`exact_match = True` or `False`</li><li>`validate_types = True` or `False`</li></ul> |
+| **JSONMatch()** <ul><li>Compares two columns of a dataframe and checks whether the two values in each row are matching JSON documents (key order and whitespace are ignored). </li><li>Returns True/False for every input. </li></ul> Example use:<br> `JSONMatch(with_column="column_2")`| **Required:** <br> `with_column: str` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
 | **ContainsLink()** <ul><li>Checks if the text contains at least one valid URL. </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ContainsLink(column_name='column_1')`| **Required:** <br>`column_name: str`<br><br>**Optional:**<ul><li>`display_name`</li></ul> |
 
 ## Descriptors: Text stats

diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py
@@ -6,6 +6,7 @@
 from .hf_descriptor import HuggingFaceModel
 from .hf_descriptor import HuggingFaceToxicityModel
 from .is_valid_json_descriptor import IsValidJSON
+from .json_match_descriptor import JSONMatch
 from .json_schema_match_descriptor import JSONSchemaMatch
 from .llm_judges import BiasLLMEval
 from .llm_judges import ContextQualityLLMEval
@@ -72,4 +73,5 @@
     "IsValidJSON",
     "JSONSchemaMatch",
     "_registry",
+    "JSONMatch",
 ]
diff --git a/src/evidently/descriptors/_registry.py b/src/evidently/descriptors/_registry.py
@@ -117,6 +117,11 @@
     "evidently.descriptors.custom_descriptor.CustomPairColumnEval",
     "evidently:descriptor:CustomPairColumnEval",
 )
+register_type_alias(
+    FeatureDescriptor,
+    "evidently.descriptors.json_match_descriptor.JSONMatch",
+    "evidently:descriptor:JSONMatch",
+)
 register_type_alias(
     FeatureDescriptor,
     "evidently.descriptors.contains_link_descriptor.ContainsLink",

diff --git a/src/evidently/descriptors/json_match_descriptor.py b/src/evidently/descriptors/json_match_descriptor.py
@@ -0,0 +1,13 @@
+from evidently.features import json_match_feature
+from evidently.features.generated_features import FeatureDescriptor
+from evidently.features.generated_features import GeneratedFeature
+
+
+class JSONMatch(FeatureDescriptor):
+    """Descriptor that builds the JSON-match feature for a pair of columns."""
+
+    class Config:
+        type_alias = "evidently:descriptor:JSONMatch"
+
+    # Name of the column compared against the column passed to ``feature``.
+    with_column: str
+
+    def feature(self, column_name: str) -> GeneratedFeature:
+        """Return the feature comparing ``column_name`` with ``with_column``."""
+        matcher = json_match_feature.JSONMatch(
+            first_column=column_name,
+            second_column=self.with_column,
+        )
+        return matcher
diff --git a/src/evidently/features/_registry.py b/src/evidently/features/_registry.py
@@ -90,6 +90,7 @@
 register_type_alias(
     GeneratedFeatures, "evidently.features.words_feature.WordsPresence", "evidently:feature:WordsPresence"
 )
+register_type_alias(GeneratedFeatures, "evidently.features.json_match_feature.JSONMatch", "evidently:feature:JSONMatch")
 register_type_alias(
     GeneratedFeatures, "evidently.features.contains_link_feature.ContainsLink", "evidently:feature:ContainsLink"
 )

diff --git a/src/evidently/features/json_match_feature.py b/src/evidently/features/json_match_feature.py
@@ -0,0 +1,46 @@
+import json
+
+import pandas as pd
+
+from evidently import ColumnType
+from evidently.base_metric import ColumnName
+from evidently.features.generated_features import FeatureTypeFieldMixin
+from evidently.features.generated_features import GeneratedFeature
+from evidently.utils.data_preprocessing import DataDefinition
+
+
+class JSONMatch(FeatureTypeFieldMixin, GeneratedFeature):
+    """Generated feature telling whether two columns hold equal JSON documents.
+
+    For every row, the values of ``first_column`` and ``second_column`` are parsed
+    with ``json.loads`` and the parsed objects are compared with ``==``, so key
+    order and insignificant whitespace do not affect the result. A row yields
+    ``False`` when either value is rejected by ``json.loads``.
+    """
+
+    class Config:
+        type_alias = "evidently:feature:JSONMatch"
+
+    # Names of the two columns whose values are compared row by row.
+    first_column: str
+    second_column: str
+    feature_type: ColumnType = ColumnType.Categorical
+
+    def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
+        """Return a one-column boolean frame aligned with the index of ``data``.
+
+        The input frame is not modified: the result is built as a separate
+        Series instead of being written into ``data`` first.
+
+        Args:
+            data: frame containing ``first_column`` and ``second_column``.
+            data_definition: not used by this feature.
+
+        Returns:
+            A DataFrame with a single column named by ``_feature_column_name()``.
+        """
+
+        def compare_json_objects(first_json_object: str, second_json_object: str) -> bool:
+            try:
+                # Parsed dicts compare equal regardless of key order.
+                return json.loads(first_json_object) == json.loads(second_json_object)
+            except (ValueError, TypeError):
+                # ValueError: malformed JSON text; TypeError: value is not str/bytes (e.g. None, NaN).
+                return False
+
+        matches = [
+            compare_json_objects(first_value, second_value)
+            for first_value, second_value in zip(data[self.first_column], data[self.second_column])
+        ]
+        # Explicit dtype keeps the column boolean even when ``data`` has no rows.
+        column = pd.Series(matches, index=data.index, dtype=bool)
+        return column.to_frame(self._feature_column_name())
+
+    def _as_column(self) -> "ColumnName":
+        """Describe the generated column, with a default display name naming both inputs."""
+        return self._create_column(
+            self._feature_column_name(),
+            default_display_name=f"JSON match for columns {self.first_column} and {self.second_column}",
+        )
+
+    def _feature_column_name(self) -> str:
+        """Return the name of the column produced by ``generate_feature``."""
+        return f"JSON match for {self.first_column} and {self.second_column}"
diff --git a/tests/features/test_json_match.py b/tests/features/test_json_match.py
@@ -0,0 +1,57 @@
+import pandas as pd
+
+from evidently.features.json_match_feature import JSONMatch
+from evidently.pipeline.column_mapping import ColumnMapping
+from evidently.utils.data_preprocessing import create_data_definition
+
+
+def test_json_match_feature():
+    """JSONMatch marks a row True only when both columns hold equal, valid JSON."""
+    feature_generator = JSONMatch(
+        first_column="col_1", second_column="col_2", display_name="Json Match", feature_type="num", name="is_json_match"
+    )
+
+    # Define JSON strings for each scenario
+    scenarios = [
+        # Scenario 1 - Matching JSONs
+        ('{"name": "Alice", "age": 25, "city": "London"}', '{"city": "London", "age": 25, "name": "Alice"}'),
+        # Scenario 2 - Different whitespace (still matching)
+        ('{ "name" : "Bob" , "age" : 22 , "city" : "Paris" }', '{"city": "Paris", "name": "Bob", "age": 22}'),
+        # Scenario 3 - Invalid JSON in one column
+        (
+            '{"name": "Eve", "age": 28, "city": "Berlin"}',
+            '{"city": "Berlin", "age": 28, "name": Eve}',
+        ),  # Missing quotes around "Eve"
+        # Scenario 4 - Keys mismatch
+        (
+            '{"name": "Charlie", "age": 30, "country": "USA"}',
+            '{"name": "Charlie", "age": 30, "city": "USA"}',
+        ),  # 'country' vs 'city'
+        # Scenario 5 - Values mismatch
+        (
+            '{"name": "David", "age": 35, "city": "Tokyo"}',
+            '{"city": "Tokyo", "age": 35, "name": "Daniel"}',
+        ),  # 'David' vs 'Daniel'
+    ]
+    expected_matches = [True, True, False, False, False]
+
+    # Create DataFrame
+    data = pd.DataFrame(scenarios, columns=["col_1", "col_2"])
+
+    result = feature_generator.generate_feature(
+        data=data,
+        data_definition=create_data_definition(None, data, ColumnMapping()),
+    )
+
+    # Assert directly so a mismatch fails the test instead of being swallowed.
+    # Values are compared by position because the output column is named by the feature itself.
+    assert result.shape == (len(scenarios), 1)
+    assert result.iloc[:, 0].tolist() == expected_matches