Skip to content

Commit

Permalink
Merge pull request #245 from nicholaswold/nick/fixup-invalid-metadata
Browse files Browse the repository at this point in the history
Strip metadata that is causing notebook validation to fail
  • Loading branch information
ivanov authored Jan 24, 2022
2 parents d073357 + 884adeb commit 59e927d
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 6 deletions.
13 changes: 13 additions & 0 deletions nbformat/json_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import jsonschema
from jsonschema import Draft4Validator as _JsonSchemaValidator
from jsonschema import ValidationError
from jsonschema import ErrorTree

try:
import fastjsonschema
Expand All @@ -33,6 +34,9 @@ def validate(self, data):
def iter_errors(self, data, schema=None):
return self._default_validator.iter_errors(data, schema)

def error_tree(self, errors):
return ErrorTree(errors)


class FastJsonSchemaValidator(JsonSchemaValidator):
name = "fastjsonschema"
Expand Down Expand Up @@ -60,6 +64,15 @@ def iter_errors(self, data, schema=None):

return errors

def error_tree(self, errors):
# fastjsonschema's exceptions don't contain the same information that the jsonschema ValidationErrors
# do. This method is primarily used for introspecting metadata schema failures so that we can strip
# them if asked to do so in `nbformat.validate`.
# Another way forward for compatibility: we could distill both validator errors into a custom collection
# for this data. Since implementation details of ValidationError is used elsewhere, we would probably
# just use this data for schema introspection.
raise NotImplementedError("JSON schema error introspection not enabled for fastjsonschema")


_VALIDATOR_MAP = [
("fastjsonschema", fastjsonschema, FastJsonSchemaValidator),
Expand Down
10 changes: 10 additions & 0 deletions nbformat/tests/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import os
import re

import nbformat

from .base import TestsBase
from jsonschema import ValidationError
from nbformat import read
Expand Down Expand Up @@ -275,3 +277,11 @@ def test_notebook_invalid_without_min_version():

def test_notebook_invalid_without_main_version():
pass


def test_strip_invalid_metadata():
with TestsBase.fopen(u'v4_5_invalid_metadata.ipynb', u'r') as f:
nb = nbformat.from_dict(json.load(f))
assert not isvalid(nb)
validate(nb, strip_invalid_metadata=True)
assert isvalid(nb)
64 changes: 64 additions & 0 deletions nbformat/tests/v4_5_invalid_metadata.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"cells": [
{
"id": "asdf",
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": "BAD: should be bool",
"scrolled": "BAD: should be bool",
"deletable": "BAD: should be bool",
"editable": "BAD: should be bool",
"format": "BAD: should be bool",
"name": 12345,
"tags": "BAD: should be list of str",
"jupyter": "BAD: should be dict",
"execution": "BAD: should be dict"
},
"outputs": [
{
"data": {
"text/plain": [
"'bar'"
]
},
"execution_count": 1,
"metadata": {
"isolated": "BAD: should be bool"
},
"output_type": "execute_result"
}
],
"source": [
"\"foo\""
]
},
{
"id": "jlkj",
"cell_type": "code",
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": "BAD: should be dict",
"outputs_hidden": "BAD: should be dict"
},
"execution": {
"iopub.execute_input": "BAD: should be ISO 8601 format",
"iopub.status.busy": "BAD: should be ISO 8601 format",
"shell.execute_reply": "BAD: should be ISO 8601 format",
"iopub.status.idle": "BAD: should be ISO 8601 format"
}
},
"outputs": [],
"source": [
"\"foo 2\""
]
}
],
"metadata": {
"kernelspec": "BAD: should be a dict",
"authors": "BAD: should be a list of dicts"
},
"nbformat": 4,
"nbformat_minor": 5
}
45 changes: 39 additions & 6 deletions nbformat/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
import os
import pprint
import sys
import warnings

from ._imports import import_item
from .json_compat import get_current_validator, ValidationError
from .reader import get_version, reads
from .reader import get_version
from .corpus.words import generate_corpus_id

from traitlets.log import get_logger
Expand Down Expand Up @@ -233,7 +232,8 @@ def better_validation_error(error, version, version_minor):


def validate(nbdict=None, ref=None, version=None, version_minor=None,
relax_add_props=False, nbjson=None, repair_duplicate_cell_ids=True):
relax_add_props=False, nbjson=None, repair_duplicate_cell_ids=True,
strip_invalid_metadata=False):
"""Checks whether the given notebook dict-like object
conforms to the relevant notebook format schema.
Expand Down Expand Up @@ -271,7 +271,8 @@ def validate(nbdict=None, ref=None, version=None, version_minor=None,

for error in iter_validate(nbdict, ref=ref, version=version,
version_minor=version_minor,
relax_add_props=relax_add_props):
relax_add_props=relax_add_props,
strip_invalid_metadata=strip_invalid_metadata):
raise error

if notebook_supports_cell_ids:
Expand All @@ -290,7 +291,7 @@ def validate(nbdict=None, ref=None, version=None, version_minor=None,


def iter_validate(nbdict=None, ref=None, version=None, version_minor=None,
relax_add_props=False, nbjson=None):
relax_add_props=False, nbjson=None, strip_invalid_metadata=False):
"""Checks whether the given notebook dict-like object conforms to the
relevant notebook format schema.
Expand All @@ -317,7 +318,39 @@ def iter_validate(nbdict=None, ref=None, version=None, version_minor=None,
if ref:
errors = validator.iter_errors(nbdict, {'$ref' : '#/definitions/%s' % ref})
else:
errors = validator.iter_errors(nbdict)
errors = [e for e in validator.iter_errors(nbdict)]

if len(errors) > 0 and strip_invalid_metadata:
error_tree = validator.error_tree(errors)
if "metadata" in error_tree:
for key in error_tree["metadata"]:
nbdict["metadata"].pop(key, None)

if "cells" in error_tree:
number_of_cells = len(nbdict.get("cells", 0))
for cell_idx in range(number_of_cells):
# Cells don't report individual metadata keys as having failed validation
# Instead it reports that it failed to validate against each cell-type definition.
# We have to delve into why those definitions failed to uncover which metadata
# keys are misbehaving.
if "oneOf" in error_tree["cells"][cell_idx].errors:
intended_cell_type = nbdict["cells"][cell_idx]["cell_type"]
schemas_by_index = [ref["$ref"] for ref in error_tree["cells"][cell_idx].errors["oneOf"].schema["oneOf"]]
cell_type_definition_name = f"#/definitions/{intended_cell_type}_cell"
if cell_type_definition_name in schemas_by_index:
schema_index = schemas_by_index.index(cell_type_definition_name)
for error in error_tree["cells"][cell_idx].errors["oneOf"].context:
rel_path = error.relative_path
error_for_intended_schema = error.schema_path[0] == schema_index
is_top_level_metadata_key = len(rel_path) == 2 and rel_path[0] == "metadata"
if error_for_intended_schema and is_top_level_metadata_key:
nbdict["cells"][cell_idx]["metadata"].pop(rel_path[1], None)

# Validate one more time to ensure that us removing metadata
# didn't cause another complex validation issue in the schema.
# Also to ensure that higher-level errors produced by individual metadata validation
# failures are removed.
errors = validator.iter_errors(nbdict)

for error in errors:
yield better_validation_error(error, version, version_minor)

0 comments on commit 59e927d

Please sign in to comment.