Merge pull request #245 from nicholaswold/nick/fixup-invalid-metadata

Strip metadata that is causing notebook validation to fail
jupyter · Jan 24, 2022 · 59e927d · 59e927d
2 parents d073357 + 884adeb
commit 59e927d
Show file tree

Hide file tree

Showing 4 changed files with 126 additions and 6 deletions.
diff --git a/nbformat/json_compat.py b/nbformat/json_compat.py
@@ -10,6 +10,7 @@
 import jsonschema
 from jsonschema import Draft4Validator as _JsonSchemaValidator
 from jsonschema import ValidationError
+from jsonschema import ErrorTree
 
 try:
     import fastjsonschema
@@ -33,6 +34,9 @@ def validate(self, data):
     def iter_errors(self, data, schema=None):
         return self._default_validator.iter_errors(data, schema)
 
+    def error_tree(self, errors):
+        return ErrorTree(errors)
+
 
 class FastJsonSchemaValidator(JsonSchemaValidator):
     name = "fastjsonschema"
@@ -60,6 +64,15 @@ def iter_errors(self, data, schema=None):
 
         return errors
 
+    def error_tree(self, errors):
+        # fastjsonschema's exceptions don't contain the same information that the jsonschema ValidationErrors
+        # do. This method is primarily used for introspecting metadata schema failures so that we can strip
+        # them if asked to do so in `nbformat.validate`.
+        # Another way forward for compatibility: we could distill both validator errors into a custom collection
+        # for this data. Since implementation details of ValidationError is used elsewhere, we would probably
+        # just use this data for schema introspection.
+        raise NotImplementedError("JSON schema error introspection not enabled for fastjsonschema")
+
 
 _VALIDATOR_MAP = [
     ("fastjsonschema", fastjsonschema, FastJsonSchemaValidator),

diff --git a/nbformat/tests/test_validator.py b/nbformat/tests/test_validator.py
@@ -7,6 +7,8 @@
 import os
 import re
 
+import nbformat
+
 from .base import TestsBase
 from jsonschema import ValidationError
 from nbformat import read
@@ -275,3 +277,11 @@ def test_notebook_invalid_without_min_version():
 
 def test_notebook_invalid_without_main_version():
     pass
+
+
+def test_strip_invalid_metadata():
+    with TestsBase.fopen(u'v4_5_invalid_metadata.ipynb', u'r') as f:
+        nb = nbformat.from_dict(json.load(f))
+    assert not isvalid(nb)
+    validate(nb, strip_invalid_metadata=True)
+    assert isvalid(nb)
diff --git a/nbformat/tests/v4_5_invalid_metadata.ipynb b/nbformat/tests/v4_5_invalid_metadata.ipynb
@@ -0,0 +1,64 @@
+{
+ "cells": [
+  {
+   "id": "asdf",
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": "BAD: should be bool",
+    "scrolled": "BAD: should be bool",
+    "deletable": "BAD: should be bool",
+    "editable": "BAD: should be bool",
+    "format": "BAD: should be bool",
+    "name": 12345,
+    "tags": "BAD: should be list of str",
+    "jupyter": "BAD: should be dict",
+    "execution": "BAD: should be dict"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'bar'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {
+      "isolated": "BAD: should be bool"
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"foo\""
+   ]
+  },
+  {
+   "id": "jlkj",
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "jupyter": {
+     "source_hidden": "BAD: should be dict",
+     "outputs_hidden": "BAD: should be dict"
+    },
+    "execution": {
+     "iopub.execute_input": "BAD: should be ISO 8601 format",
+     "iopub.status.busy": "BAD: should be ISO 8601 format",
+     "shell.execute_reply": "BAD: should be ISO 8601 format",
+     "iopub.status.idle": "BAD: should be ISO 8601 format"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "\"foo 2\""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": "BAD: should be a dict",
+  "authors": "BAD: should be a list of dicts"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbformat/validator.py b/nbformat/validator.py
@@ -6,11 +6,10 @@
 import os
 import pprint
 import sys
-import warnings
 
 from ._imports import import_item
 from .json_compat import get_current_validator, ValidationError
-from .reader import get_version, reads
+from .reader import get_version
 from .corpus.words import generate_corpus_id
 
 from traitlets.log import get_logger
@@ -233,7 +232,8 @@ def better_validation_error(error, version, version_minor):
 
 
 def validate(nbdict=None, ref=None, version=None, version_minor=None,
-             relax_add_props=False, nbjson=None, repair_duplicate_cell_ids=True):
+             relax_add_props=False, nbjson=None, repair_duplicate_cell_ids=True,
+             strip_invalid_metadata=False):
     """Checks whether the given notebook dict-like object
     conforms to the relevant notebook format schema.
 
@@ -271,7 +271,8 @@ def validate(nbdict=None, ref=None, version=None, version_minor=None,
 
     for error in iter_validate(nbdict, ref=ref, version=version,
                                version_minor=version_minor,
-                               relax_add_props=relax_add_props):
+                               relax_add_props=relax_add_props,
+                               strip_invalid_metadata=strip_invalid_metadata):
         raise error
 
     if notebook_supports_cell_ids:
@@ -290,7 +291,7 @@ def validate(nbdict=None, ref=None, version=None, version_minor=None,
 
 
 def iter_validate(nbdict=None, ref=None, version=None, version_minor=None,
-                  relax_add_props=False, nbjson=None):
+                  relax_add_props=False, nbjson=None, strip_invalid_metadata=False):
     """Checks whether the given notebook dict-like object conforms to the
     relevant notebook format schema.
 
@@ -317,7 +318,39 @@ def iter_validate(nbdict=None, ref=None, version=None, version_minor=None,
     if ref:
         errors = validator.iter_errors(nbdict, {'$ref' : '#/definitions/%s' % ref})
     else:
-        errors = validator.iter_errors(nbdict)
+        errors = [e for e in validator.iter_errors(nbdict)]
+
+        if len(errors) > 0 and strip_invalid_metadata:
+            error_tree = validator.error_tree(errors)
+            if "metadata" in error_tree:
+                for key in error_tree["metadata"]:
+                    nbdict["metadata"].pop(key, None)
+
+            if "cells" in error_tree:
+                number_of_cells = len(nbdict.get("cells", 0))
+                for cell_idx in range(number_of_cells):
+                    # Cells don't report individual metadata keys as having failed validation
+                    # Instead it reports that it failed to validate against each cell-type definition.
+                    # We have to delve into why those definitions failed to uncover which metadata
+                    # keys are misbehaving.
+                    if "oneOf" in error_tree["cells"][cell_idx].errors:
+                        intended_cell_type = nbdict["cells"][cell_idx]["cell_type"]
+                        schemas_by_index = [ref["$ref"] for ref in error_tree["cells"][cell_idx].errors["oneOf"].schema["oneOf"]]
+                        cell_type_definition_name = f"#/definitions/{intended_cell_type}_cell"
+                        if cell_type_definition_name in schemas_by_index:
+                            schema_index = schemas_by_index.index(cell_type_definition_name)
+                            for error in error_tree["cells"][cell_idx].errors["oneOf"].context:
+                                rel_path = error.relative_path
+                                error_for_intended_schema = error.schema_path[0] == schema_index
+                                is_top_level_metadata_key = len(rel_path) == 2 and rel_path[0] == "metadata"
+                                if error_for_intended_schema and is_top_level_metadata_key:
+                                    nbdict["cells"][cell_idx]["metadata"].pop(rel_path[1], None)
+
+            # Validate one more time to ensure that us removing metadata
+            # didn't cause another complex validation issue in the schema.
+            # Also to ensure that higher-level errors produced by individual metadata validation
+            # failures are removed.
+            errors = validator.iter_errors(nbdict)
 
     for error in errors:
         yield better_validation_error(error, version, version_minor)