diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py
index df22af7e0..a9637dee5 100644
--- a/agenta-backend/agenta_backend/services/evaluators_service.py
+++ b/agenta-backend/agenta_backend/services/evaluators_service.py
@@ -53,47 +53,6 @@ def validate_string_output(
     return output
 
 
-def validate_json_output(
-    evaluator_key: str, output: Union[str, Dict[str, Any]]
-) -> Union[str, dict]:
-    """Checks and validate the output to be of type JSON string or dictionary.
-
-    Args:
-        evaluator_key (str): the key of the evaluator
-        output (Union[str, Dict[str, Any]]): the llm response
-
-    Raises:
-        Exception: requires output to be a JSON string
-
-    Returns:
-        str, dict: output
-    """
-
-    output = output.get("data", "") if isinstance(output, dict) else output
-    if isinstance(output, dict):
-        output = json.dumps(output)
-    elif isinstance(output, str):
-        try:
-            json.loads(output)
-        except json.JSONDecodeError:
-            raise Exception(
-                f"Evaluator {evaluator_key} requires the output to be a JSON string or object."
-            )
-
-    if not isinstance(
-        output,
-        (
-            str,
-            dict,
-        ),
-    ):
-        raise Exception(
-            f"Evaluator {evaluator_key} requires the output to be either a JSON string or object, but received {type(output).__name__} instead."
-        )
-
-    return output
-
-
 async def map(
     mapping_input: EvaluatorMappingInputInterface,
 ) -> EvaluatorMappingOutputInterface:
@@ -684,7 +643,16 @@ async def auto_contains_json(
     lm_providers_keys: Dict[str, Any],  # pylint: disable=unused-argument
 ) -> Result:
     try:
-        output = validate_json_output("contains_json", output)
+        # parsing llm app output format if v2
+        output = output.get("data", "") if isinstance(output, dict) else output
+        if isinstance(output, dict):
+            output = json.dumps(
+                output
+            )  # contains_json expects inputs.prediction to be a string
+        elif not isinstance(output, (str, dict)):
+            raise Exception(
+                f"Evaluator contains_json requires the app output to be either a JSON string or object, but received {type(output).__name__} instead."
+            )
         response = await contains_json(
             input=EvaluatorInputInterface(**{"inputs": {"prediction": output}})
         )
@@ -707,7 +675,7 @@ async def contains_json(input: EvaluatorInputInterface) -> EvaluatorOutputInterf
         potential_json = str(input.inputs["prediction"])[start_index:end_index]
         json.loads(potential_json)
         contains_json = True
-    except (ValueError, json.JSONDecodeError):
+    except (ValueError, json.JSONDecodeError) as e:
         contains_json = False
 
     return {"outputs": {"success": contains_json}}
@@ -825,8 +793,9 @@ async def auto_json_diff(
     lm_providers_keys: Dict[str, Any],  # pylint: disable=unused-argument
 ) -> Result:
     try:
-        output = validate_json_output("json_diff", output)
+        # 2. extract ground truth from data point
         correct_answer = get_correct_answer(data_point, settings_values)
+
         response = await json_diff(
             input=EvaluatorInputInterface(
                 **{
@@ -836,7 +805,16 @@ async def auto_json_diff(
             )
         )
         return Result(type="number", value=response["outputs"]["score"])
-    except (ValueError, json.JSONDecodeError, Exception):
+    except json.JSONDecodeError:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(
+                message="Expected answer is not a valid JSON",
+                stacktrace=traceback.format_exc(),
+            ),
+        )
+    except (ValueError, Exception):
         return Result(
             type="error",
             value=None,
@@ -848,12 +826,32 @@ async def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
-    average_score = compare_jsons(
-        ground_truth=input.inputs["ground_truth"],
-        app_output=json.loads(input.inputs["prediction"]),
+    ground_truth = input.inputs["ground_truth"]
+    if isinstance(ground_truth, str):
+        ground_truth = json.loads(ground_truth)  # if this fails we will return an error
+
+    # 1. extract llm app output if app output format is v2+
+    app_output = input.inputs["prediction"]
+    assert isinstance(
+        app_output, (str, dict)
+    ), "App output is expected to be a string or a JSON object"
+    app_output = (
+        app_output.get("data", "") if isinstance(app_output, dict) else app_output
+    )
+    if isinstance(app_output, str):
+        try:
+            app_output = json.loads(app_output)
+        except json.JSONDecodeError:
+            app_output = (
+                {}
+            )  # we will return 0 score for json diff in case we cannot parse the output as json
+
+    score = compare_jsons(
+        ground_truth=ground_truth,
+        app_output=app_output,
         settings_values=input.settings,
     )
-    return {"outputs": {"score": average_score}}
+    return {"outputs": {"score": score}}
 
 
 async def measure_rag_consistency(
diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
index 0714a34ef..0b4f65a00 100644
--- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
+++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
@@ -222,13 +222,13 @@ async def test_auto_contains_all(output, substrings, case_sensitive, expected):
 @pytest.mark.parametrize(
     "output, expected",
     [
-        ('Some random text {"key": "value"} more text', None),
-        ("No JSON here!", None),
-        ("{Malformed JSON, nope!}", None),
+        ('Some random text {"key": "value"} more text', True),
+        ("No JSON here!", False),
+        ("{Malformed JSON, nope!}", False),
         ('{"valid": "json", "number": 123}', True),
         ({"data": {"message": "The capital of Azerbaijan is Baku."}}, True),
         ({"data": '{"message": "The capital of Azerbaijan is Baku."}'}, True),
-        ({"data": "The capital of Azerbaijan is Baku."}, None),
+        ({"data": "The capital of Azerbaijan is Baku."}, False),
     ],
 )
 @pytest.mark.asyncio